import linguistic
def iter_terms(text, stop_words, min_size=1, max_size=5):
    """Yield candidate terms from *text* as windows of word groups.

    Words are extracted with ``linguistic.word_re_no_numbers`` and
    lower-cased, then partitioned into alternating groups: a run of content
    words, a run of stop words, a run of content words, and so on.  A word
    is treated as a stop word when it is in *stop_words* or is a single
    character.  Stop words that precede the first content word are ignored.

    For each content group, the windows starting at that group are yielded
    with sizes ``range(min_size, max_size + 1, 2)`` (1, 3 and 5 groups with
    the defaults, so every window starts and ends on a content group).
    Windows that would run past the end of the text are not produced.

    Args:
        text: The text to scan.
        stop_words: Container of lower-case stop words (tested with ``in``).
        min_size: Smallest window size, in groups.
        max_size: Largest window size, in groups; also the buffer length.

    Yields:
        A list of groups, each group being a list of lower-cased words.
        The inner lists are shared between overlapping windows, so callers
        should not mutate them.
    """
    # Sliding buffer of the most recent groups; starts as empty padding so
    # that the first real window only appears once the padding is shifted out.
    window = [[] for _ in range(max_size)]
    sizes = range(min_size, max_size + 1, 2)
    # False while collecting a content group, True while collecting stop words.
    searching_stop = False

    for match in linguistic.word_re_no_numbers.finditer(text):
        word = match.group(0).lower()
        stop = word in stop_words or len(word) == 1
        if stop == searching_stop:
            # Same kind of word as the current group: extend it.
            window[-1].append(word)
        elif window[-1]:
            # The word kind changed and the current group is not empty:
            # close the group and start a new one.
            searching_stop = not searching_stop
            if searching_stop:
                # A content group has just been completed, so the buffer is
                # full.  Emit the windows anchored at its first group (unless
                # that is still padding) and slide forward by one
                # content/stop pair.
                if window[0]:
                    for size in sizes:
                        yield window[0:size]
                window = window[2:]
            window.append([word])

    # End of text: the windows anchored at the last few content groups are
    # still sitting in the buffer because no later stop word arrived to push
    # them out.  Keep sliding over empty padding so they are emitted too.
    if searching_stop:
        # Stand-in for the content group that never came, restoring the
        # buffer to its full length.
        window.append([])
    while any(window):
        if window[0]:
            for size in sizes:
                candidate = window[0:size]
                # Skip windows that reach into the padding, i.e. that would
                # extend beyond the end of the text.
                if all(candidate):
                    yield candidate
        window = window[2:] + [[], []]
def find_repeated_terms(text, stop_words):
    """Count the terms of *text* that occur more than once.

    Every term produced by ``iter_terms(text, stop_words)`` is flattened to
    a single space-separated string and tallied, except terms consisting of
    exactly one word.  Terms seen only once are then discarded, and each
    remaining term's count is subtracted from the counts of its word
    prefixes; a prefix that is no more frequent than the longer term is
    removed altogether.

    Args:
        text: The text to analyse.
        stop_words: Stop words, passed through to ``iter_terms``.

    Returns:
        A dict mapping each surviving term string to its adjusted count.
    """
    counts = {}

    # Tally every candidate term under its flattened text.
    for groups in iter_terms(text, stop_words):
        is_single_word = len(groups) == 1 and len(groups[0]) == 1
        if is_single_word:
            continue
        key = ' '.join(' '.join(group) for group in groups)
        counts[key] = counts.get(key, 0) + 1

    # Reverse lexicographic order visits a term before any of its prefixes,
    # so each prefix has been discounted by all longer terms by the time it
    # is examined itself.
    for key in sorted(counts, reverse=True):
        occurrences = counts.get(key)
        if occurrences is None:
            # Already removed as the prefix of a longer term.
            continue
        if occurrences == 1:
            del counts[key]
            continue
        words = key.split()
        for end in range(1, len(words)):
            prefix = ' '.join(words[:end])
            if prefix not in counts:
                continue
            if counts[prefix] > occurrences:
                # Keep only the occurrences not explained by the longer term.
                counts[prefix] -= occurrences
            else:
                del counts[prefix]
    return counts