#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import linguistic
def iter_terms(text, stop_words, min_size=1, max_size=5):
    """Yield candidate terms found in *text*.

    The words matched by ``linguistic.word_re_no_numbers`` are lower-cased
    and split into alternating groups: runs of consecutive non-stop words
    and runs of consecutive stop words.  A word counts as a stop word when
    it is in *stop_words* or is a single character long.  Stop words that
    precede the first non-stop word are ignored.

    Each yielded term is a list of groups (each group being a list of
    words) that starts with a non-stop group.  For every starting group,
    terms of ``min_size``, ``min_size + 2``, ... groups are yielded, up to
    ``max_size`` groups or as many as remain before the end of the text.

    Terms starting in the last groups of the text are yielded too: the
    window is drained once the input is exhausted, so short texts and the
    tail of long texts are not lost.

    NOTE(review): the window advances two groups at a time, so the group
    alignment appears to rely on ``max_size`` being odd and at least 3 --
    confirm before passing other values.
    """
    # Sliding window of word groups; the last entry is the group in progress.
    window = [[] for _ in range(max_size)]
    in_stop_run = False
    for match in linguistic.word_re_no_numbers.finditer(text):
        word = match.group(0).lower()
        is_stop = word in stop_words or len(word) == 1
        if is_stop == in_stop_run:
            # Same kind of word as the current group: keep extending it.
            window[-1].append(word)
        elif window[-1]:
            in_stop_run = not in_stop_run
            if in_stop_run:
                # A non-stop group was just completed: the window is full,
                # so emit every term starting at its oldest group, then
                # drop that group together with the stop group after it.
                if window[0]:
                    for size in range(min_size, max_size + 1, 2):
                        yield window[0:size]
                window = window[2:]
            window.append([word])
        # Otherwise: a stop word before any non-stop word has been seen.

    # Drain the window so terms near the end of the text are emitted too.
    if in_stop_run:
        # Trailing stop words cannot be part of any term start.
        window.pop()
    while window:
        if window[0]:
            largest = min(max_size, len(window))
            for size in range(min_size, largest + 1, 2):
                yield window[0:size]
        window = window[2:]
def find_repeated_terms(text, stop_words):
    """Return a dict mapping each repeated term of *text* to its count.

    Terms come from ``iter_terms(text, stop_words)``; each one is flattened
    into a single space-separated string.  Terms consisting of one group
    holding one word are not counted.  Terms seen only once are discarded,
    and the count of a longer term is subtracted from every shorter term
    that is a word-prefix of it; a prefix whose count does not exceed the
    longer term's count is removed altogether.
    """
    counts = {}

    # Tally every candidate term, skipping those made of a single word.
    for groups in iter_terms(text, stop_words):
        if len(groups) == 1 and len(groups[0]) == 1:
            continue
        key = ' '.join(' '.join(group) for group in groups)
        counts[key] = counts.get(key, 0) + 1

    # Reverse lexicographic order visits a term before any of its prefixes,
    # so a prefix has been fully adjusted by the time it is examined itself.
    for key in sorted(counts, reverse=True):
        if key not in counts:
            # Already removed as a prefix of a longer term.
            continue
        occurrences = counts[key]
        if occurrences == 1:
            del counts[key]
            continue
        words = key.split()
        for end in range(1, len(words)):
            prefix = ' '.join(words[:end])
            if prefix not in counts:
                continue
            if counts[prefix] <= occurrences:
                del counts[prefix]
            else:
                counts[prefix] -= occurrences
    return counts