#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import linguistic
from Nb_noy_voc import Nb_noy_voc


def words_weights(texts):
    """Compute a weight for every word across `texts`, penalizing words
    that are frequent in the vocabulary database (Nb_noy_voc)."""
    weights = {}
    for i, text in enumerate(texts):
        counts, total = linguistic.count(text, linguistic.word_re_no_numbers)
        # Decay the weight of words that are in the db but not in the
        # current document.
        for word in set(weights).difference(counts):
            weights[word] /= Nb_noy_voc(word) + 1
        for word, count in counts.items():
            a = Nb_noy_voc(word)
            # Skip single-letter words and words with a zero db count.
            if len(word) == 1 or a == 0:
                continue
            weight = count / ((a + 1) ** 2 * total) * 10000
            if word in weights:
                # The word is already in the db: average both weights.
                weights[word] = (weights[word] + weight) / 2
            else:
                # Damp new words by the number of texts already analyzed.
                weights[word] = weight / ((i + 1) ** (1 / 1.5))
    return weights


def find_stop_words(texts):
    """Return candidate stop words: words whose weight dwarfs that of an
    adjacent word while staying rare in the vocabulary database."""
    ret = {}
    weights = words_weights(texts)
    for text in texts:
        for w1, w2 in linguistic.iter_word_groups(
                text, linguistic.word_re_no_numbers, 2, 2):
            w1, w2 = w1.lower(), w2.lower()
            try:
                c1, c2 = weights[w1], weights[w2]
            except KeyError:
                continue
            # Flag a word when its weight exceeds 1, dominates its
            # neighbor's weight by three orders of magnitude, and the
            # word is nearly absent from the vocabulary database.
            if c1 > 1 and c1 > c2 * 1000 and Nb_noy_voc(w1) < 3:
                ret[w1] = ret.get(w1, 0) + 1
            if c2 > 1 and c2 > c1 * 1000 and Nb_noy_voc(w2) < 3:
                ret[w2] = ret.get(w2, 0) + 1
    return ret
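

# A minimal usage sketch, assuming the `linguistic` module and the
# Nb_noy_voc database are set up as in the rest of the project; the
# sample texts below are placeholders, not real data.
if __name__ == "__main__":
    sample_texts = [
        "the cat sat on the mat while the dog slept",
        "the dog chased the cat around the mat",
    ]
    # Print each stop-word candidate with the number of word pairs
    # in which it was flagged.
    for word, hits in sorted(find_stop_words(sample_texts).items()):
        print(word, hits)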