#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import linguistic


def iter_terms(text, stop_words, min_size=1, max_size=5):
	"""Yield the candidate terms found in *text*.

	Every match of ``linguistic.word_re_no_numbers`` in *text* is lower-cased
	and the resulting words are split into alternating groups: runs of
	regular words and runs of stop words.  A word counts as a stop word when
	it is in *stop_words* or is a single character long.  Stop words that
	come before the first regular word are ignored.

	A term is a list of consecutive groups starting on a regular-word group;
	each group is itself a list of words, e.g.
	``[['machine', 'learning'], ['of'], ['things']]``.  For every
	regular-word group one term is yielded per length in
	``range(min_size, max_size + 1, 2)``.  Near the end of the text, where
	fewer groups follow, only the lengths that fit are yielded.

	Args:
		text: the string to scan.
		stop_words: container supporting ``in`` with the (lower-case) stop
			words.
		min_size: smallest term length, counted in groups.
		max_size: largest term length, counted in groups.

	Yields:
		Lists of groups (lists of lower-case words), in text order of their
		first group and, for the same first group, by increasing length.
	"""
	# The front of the window must always be a regular-word slot.  Since the
	# window is shifted by two groups at a time, that requires an odd number
	# of slots, and at least three so that a shift never empties it.
	window_size = max(3, max_size if max_size % 2 else max_size + 1)
	# Empty leading groups are placeholders until enough text has been read.
	window = [[] for _ in range(window_size)]
	in_stop_run = False
	for match in linguistic.word_re_no_numbers.finditer(text):
		word = match.group(0).lower()
		is_stop = word in stop_words or len(word) == 1
		if is_stop == in_stop_run:
			# Same kind of word as the group being built: extend it.
			window[-1].append(word)
		elif window[-1]:
			in_stop_run = not in_stop_run
			if in_stop_run:
				# A regular-word group was just completed, so the window is
				# full: emit the terms starting on its oldest group (unless
				# that is still a placeholder), then slide by one
				# regular/stop pair.
				if window[0]:
					for size in range(min_size, max_size + 1, 2):
						yield window[0:size]
				window = window[2:]
			window.append([word])

	# End of text: the groups still in the window have not been emitted as
	# term starts yet.  Flush them, limiting the term length to the groups
	# that are actually available.
	while window:
		if window[0]:
			for size in range(min_size, min(max_size, len(window)) + 1, 2):
				yield window[0:size]
		window = window[2:]

def find_repeated_terms(text, stop_words):
	"""Return the terms of *text* that occur more than once.

	The terms produced by ``iter_terms(text, stop_words)`` are flattened to
	space-separated strings and counted; terms made of one single word are
	not counted.  Terms seen only once are then discarded, and the count of
	a term is taken away from each of its leading sub-terms (its first
	1, 2, ... words) that was counted too.  A sub-term whose count does not
	exceed the count of the longer term is removed altogether.

	Returns:
		A dict mapping each remaining term string to its count.
	"""
	counts = {}

	# First pass: count how often each term shows up.
	for groups in iter_terms(text, stop_words):
		single_word = len(groups) == 1 and len(groups[0]) == 1
		if single_word:
			continue
		phrase = ' '.join(' '.join(words) for words in groups)
		counts[phrase] = counts.get(phrase, 0) + 1

	# Second pass: drop one-off terms and discount leading sub-terms.
	# In reverse sorted order a term always comes before its own prefixes,
	# so longer terms are handled first.
	for phrase in sorted(counts, reverse=True):
		occurrences = counts.get(phrase)
		if occurrences is None:
			# Already removed as the sub-term of a longer term.
			continue
		if occurrences == 1:
			del counts[phrase]
			continue
		words = phrase.split()
		for end in range(1, len(words)):
			prefix = ' '.join(words[:end])
			if prefix not in counts:
				continue
			if counts[prefix] > occurrences:
				counts[prefix] -= occurrences
			else:
				del counts[prefix]

	return counts