#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import linguistic
from Nb_noy_voc import Nb_noy_voc


def words_weights(texts):
    """Compute a weight for every word across `texts`, penalizing words
    that are frequent in the vocabulary database (Nb_noy_voc)."""
    weights = {}
    for i, text in enumerate(texts):
        counts, total = linguistic.count(text, linguistic.word_re_no_numbers)
        # Decay the weight of words that are in the db but not in the
        # current document.
        for word in set(weights).difference(counts):
            weights[word] /= Nb_noy_voc(word) + 1
        for word, count in counts.items():
            a = Nb_noy_voc(word)
            # Skip single-letter words and words with a zero db count.
            if len(word) == 1 or a == 0:
                continue
            weight = count / ((a + 1) ** 2 * total) * 10000
            if word in weights:
                # The word is already in the db: average both weights.
                weights[word] = (weights[word] + weight) / 2
            else:
                # Damp new words by the number of texts already analyzed.
                weights[word] = weight / ((i + 1) ** (1 / 1.5))
    return weights


def find_stop_words(texts):
    """Return candidate stop words: words whose weight dwarfs that of an
    adjacent word while staying rare in the vocabulary database."""
    ret = {}
    weights = words_weights(texts)
    for text in texts:
        for w1, w2 in linguistic.iter_word_groups(
                text, linguistic.word_re_no_numbers, 2, 2):
            w1, w2 = w1.lower(), w2.lower()
            try:
                c1, c2 = weights[w1], weights[w2]
            except KeyError:
                continue
            # Flag a word when its weight exceeds 1, dominates its
            # neighbor's weight by three orders of magnitude, and the
            # word is nearly absent from the vocabulary database.
            if c1 > 1 and c1 > c2 * 1000 and Nb_noy_voc(w1) < 3:
                ret[w1] = ret.get(w1, 0) + 1
            if c2 > 1 and c2 > c1 * 1000 and Nb_noy_voc(w2) < 3:
                ret[w2] = ret.get(w2, 0) + 1
    return ret
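

# A minimal usage sketch, assuming the `linguistic` module and the
# Nb_noy_voc database are set up as in the rest of the project; the
# sample texts below are placeholders, not real data.
if __name__ == "__main__":
    sample_texts = [
        "the cat sat on the mat while the dog slept",
        "the dog chased the cat around the mat",
    ]
    # Print each stop-word candidate with the number of word pairs
    # in which it was flagged.
    for word, hits in sorted(find_stop_words(sample_texts).items()):
        print(word, hits)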