#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Word, character and word-group frequency statistics for a text."""

import re

char_re = re.compile(r'\w')
word_re = re.compile(r'\w+')
word_re_no_numbers = re.compile(r'[^\d\W]+')


def count(text, matching_re):
    """Return a (counts, total) tuple: counts maps each lower-cased match of
    matching_re to its number of occurrences, total is the overall number of
    matches."""
    counts = {}
    total = 0
    for match in matching_re.finditer(text):
        word = match.group(0).lower()
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
        total += 1
    return (counts, total)


def characteristic_words(text, word_re, limit=50, max_word_size=5):
    """
    Compute the words that are both among the $limit most frequent ones and
    at most $max_word_size characters long.
    Return a dictionary mapping each such word to its frequency (wrapped in a
    single-element list), relative to the total count of the selected words.
    """
    counts, total = count(text, word_re)
    words_by_frequency = [(word, [count/total])
                          for word, count in counts.items() if len(word) > 1]
    words_by_frequency.sort(key=lambda x: (-x[1][0], x[0]))
    words_by_frequency = set(a[0] for a in words_by_frequency[:limit])
    words_by_size = set(word for word in counts
                        if len(word) > 1 and len(word) <= max_word_size)
    characteristic_words = words_by_frequency.intersection(words_by_size)
    if len(characteristic_words) < 10:
        raise ValueError('not enough words are both short and among the most frequent ones')
    cw_total = sum(counts[word] for word in characteristic_words)
    return {word: [counts[word]/cw_total] for word in characteristic_words}


def words_stats(out, text, word_re):
    """Print summary figures and write per-word statistics to out, one line
    per word: count, rank*count, word length, word."""
    counts, total = count(text, word_re)
    words = list(counts.keys())
    print(total, 'occurrences of words')
    print(len(words), 'different words')
    print('→ '+str(float(total)/len(words))+' occurrences/different words')
    words.sort(key=lambda x: (-counts[x], x))
    for i, word in enumerate(words):
        stat = str(counts[word])+' '+str((i+1)*counts[word])+' '+str(len(word))+' '+word
        out.write(stat+'\n')


def chars_stats(out, text, char_re):
    """Print summary figures and write per-character statistics to out, one
    line per character: count, rank*count, character."""
    counts, total = count(text, char_re)
    chars = list(counts.keys())
    print(total, 'occurrences of characters')
    print(len(chars), 'different characters')
    print('→ '+str(float(total)/len(chars))+' occurrences/different characters')
    chars.sort(key=lambda x: (-counts[x], x))
    for i, char in enumerate(chars):
        stat = str(counts[char])+' '+str((i+1)*counts[char])+' '+char
        out.write(stat+'\n')


def iter_word_groups(text, word_re, min_size=2, max_size=5):
    """Yield groups of min_size to max_size consecutive lower-cased words,
    as lists, using a sliding window over the matches of word_re."""
    words = [None]*max_size
    for match in word_re.finditer(text):
        words = words[1:] + [match.group(0).lower()]
        if not words[0]:
            # The window is not full yet.
            continue
        for size in range(min_size, max_size+1):
            yield words[0:size]


def count_word_groups(text, word_re, min_size=2, max_size=5):
    """Return a (counts, total) tuple: counts maps word groups (from min_size
    to max_size consecutive words) that occur at least twice to their number
    of occurrences; total is the number of groups examined."""
    found = set()
    counts = {}
    total = 0
    for words in iter_word_groups(text, word_re, min_size, max_size):
        total += 1
        group = ' '.join(words)
        if group in counts:
            counts[group] += 1
        elif group in found:
            # Second occurrence: start counting the group.
            counts[group] = 2
            found.remove(group)
        else:
            found.add(group)
    # Remove sub-groups: occurrences of a prefix that are already accounted
    # for by a longer group are subtracted from the prefix's own count.
    for group in sorted(counts, reverse=True):
        if group not in counts:
            continue
        s = group.split()
        for size in range(2, len(s)):
            subgroup = ' '.join(s[:size])
            if subgroup in counts:
                if counts[subgroup] == counts[group]:
                    counts.pop(subgroup)
                else:
                    counts[subgroup] -= counts[group]
    return (counts, total)


def word_groups_stats(out, text, word_re):
    """Print summary figures and write per-group statistics to out, one line
    per group: count, rank*count, group."""
    counts, total = count_word_groups(text, word_re)
    word_groups = list(counts.keys())
    print(len(word_groups), 'different word groups')
    print('→ '+str(float(total)/len(word_groups))+' occurrences/different word groups')
    word_groups.sort(key=lambda x: (-counts[x], x))
    for i, group in enumerate(word_groups):
        stat = str(counts[group])+'\t'+str((i+1)*counts[group])+'\t'+group
        out.write(stat+'\n')
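

# --- Illustrative usage sketch (an assumption, not part of the original module) ---
# A minimal way the helpers above might be driven from the command line: read a
# UTF-8 text file given as the first argument and dump word, character and
# word-group statistics to standard output.  The CLI shape and the use of
# word_re_no_numbers for characteristic_words are choices made for the example,
# not something the module itself prescribes.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: %s TEXTFILE' % sys.argv[0])
    with open(sys.argv[1], encoding='utf-8') as f:
        sample = f.read()

    words_stats(sys.stdout, sample, word_re)
    chars_stats(sys.stdout, sample, char_re)
    word_groups_stats(sys.stdout, sample, word_re)

    # Short, frequent words are often a useful signature of a text.
    try:
        print(characteristic_words(sample, word_re_no_numbers))
    except ValueError as err:
        print('characteristic_words:', err)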