#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Find repeated multi-word terms in a text, using stop words as separators."""

import linguistic


def iter_terms(text, stop_words, min_size=1, max_size=5):
    """Yield candidate terms found in *text*.

    The words matched by ``linguistic.word_re_no_numbers`` are lowercased and
    split into alternating groups: runs of consecutive non-stop words and runs
    of consecutive stop words.  A word is treated as a stop word when it is in
    *stop_words* or is a single character.  Stop words that appear before the
    first non-stop word are ignored.

    For every non-stop group, slices of the group sequence starting at that
    group are yielded with lengths ``min_size, min_size + 2, ...`` up to
    *max_size* (or up to however many groups remain at the end of the text).
    With the default odd sizes, each slice starts and ends with a non-stop
    group, e.g. ``[['data']]`` and ``[['data'], ['of'], ['birth']]``.

    Args:
        text: The text to scan.
        stop_words: Container supporting ``in`` tests against lowercase words.
        min_size: Smallest number of groups in a yielded term.
        max_size: Largest number of groups in a yielded term; also the
            length of the sliding window of groups.

    Yields:
        Lists of word groups, each group being a list of lowercase words.
    """
    # Sliding window over the most recent groups; the last entry is the group
    # currently being filled.
    window = [[] for _ in range(max_size)]
    in_stop_run = False
    # NOTE(review): word_re_no_numbers is presumably a compiled regex that
    # matches words without digits -- confirm in the linguistic module.
    for match in linguistic.word_re_no_numbers.finditer(text):
        word = match.group(0).lower()
        is_stop = word in stop_words or len(word) == 1
        if is_stop == in_stop_run:
            # Same kind of word as the current group: extend it.
            window[-1].append(word)
        elif window[-1]:
            # The kind of word changed, so the current group is complete.
            in_stop_run = not in_stop_run
            if in_stop_run:
                # A non-stop group just ended: emit the terms that start at
                # the oldest group, then slide the window past that group
                # and the stop group following it.
                if window[0]:
                    for size in range(min_size, max_size + 1, 2):
                        yield window[0:size]
                window = window[2:]
            window.append([word])

    # End of text: the groups still in the window have not been emitted as
    # term starts yet.  Flush them so the tail of the text is not lost.
    if in_stop_run:
        # A trailing run of stop words cannot end a term.
        window = window[:-1]
    while window:
        if window[0]:
            largest = min(max_size, len(window))
            for size in range(min_size, largest + 1, 2):
                yield window[0:size]
        window = window[2:]


def find_repeated_terms(text, stop_words):
    """Return the multi-word terms that occur more than once in *text*.

    Terms come from :func:`iter_terms`; terms made of a single word are
    ignored.  Each term is flattened to a space-separated string and counted.
    Terms seen only once are dropped.  For every remaining term, each shorter
    term that is a word-prefix of it is either removed (when it is no more
    frequent than the longer term) or has the longer term's count subtracted
    from its own.

    Args:
        text: The text to analyse.
        stop_words: Container of lowercase stop words, passed on to
            :func:`iter_terms`.

    Returns:
        A dict mapping each retained term string to its (adjusted) count.
    """
    counts = {}

    # Count term occurrences
    for groups in iter_terms(text, stop_words):
        # A single group holding a single word is not a multi-word term.
        if len(groups) == 1 and len(groups[0]) == 1:
            continue
        key = ' '.join(' '.join(group) for group in groups)
        counts[key] = counts.get(key, 0) + 1

    # Remove sub-terms and terms that only appear once.  Descending string
    # order visits a term before any of its prefixes, so each prefix is
    # adjusted before it is examined itself.
    for key in sorted(counts, reverse=True):
        if key not in counts:
            # Already removed as a prefix of a longer term.
            continue
        if counts[key] == 1:
            del counts[key]
            continue
        words = key.split()
        for i in range(1, len(words)):
            prefix = ' '.join(words[:i])
            if prefix in counts:
                if counts[prefix] <= counts[key]:
                    del counts[prefix]
                else:
                    counts[prefix] -= counts[key]
    return counts