#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os.path

import linguistic


class Language_guesser:
    """Guesses the language of a text from a database of characteristic words.

    The database maps a language code to a dict of word -> list of stats
    (the first stat is the word's frequency).  On disk it is a plain-text
    file made of blocks separated by blank lines: the first line of a block
    is the language code, each following line is a word and its stats
    separated by tabs.
    """

    def __init__(self, dbpath):
        """Load the word database from dbpath, or start empty if no such file.

        dbpath -- path of the database file; kept as the default for save_db().
        """
        self.dbpath = dbpath
        self.db = {}
        if os.path.isfile(dbpath):
            # 'with' guarantees the handle is closed (the original leaked it).
            with open(dbpath, 'r') as dbfile:
                content = dbfile.read()
            for block in content.split('\n\n'):
                # Skip blank lines (e.g. a trailing newline at end of file).
                lines = [line for line in block.split('\n') if line]
                if not lines:
                    continue
                stats = {}
                for line in lines[1:]:
                    # split('\n') above already removed the newline, so the
                    # whole line is data (the original's line[:-1] chopped the
                    # last digit of the last value on every line).
                    values = line.split('\t')
                    stats[values[0]] = [float(v) for v in values[1:]]
                # First line of the block is the language code.
                self.db[lines[0]] = stats

    def compute_likeness(self, characteristic_words, lang):
        """Return the fraction of lang's database words present in characteristic_words.

        characteristic_words -- iterable/mapping of words found in a document
        lang -- language code to compare against
        Raises KeyError if lang is not in the database.
        """
        if lang not in self.db:
            raise KeyError('"'+lang+'" language is not in the database')
        db = self.db[lang]
        matching_words = sum(1 for word in db if word in characteristic_words)
        return matching_words / len(db)

    def learn(self, text, lang):
        """Adds a language to the database or improves the stats of an existing one."""
        limit = 15
        try:
            characteristic_words = linguistic.characteristic_words(text, limit=limit)
        except ValueError:
            print('uninteresting article, skipping')
            return
        if lang not in self.db:
            self.db[lang] = characteristic_words
            return
        db = self.db[lang]
        # Dampen (by 2 * word length) the stats of db words absent from the
        # current document.  This aims at removing topic-specific words.
        for word in set(db).difference(characteristic_words):
            db[word] = [value / (2 * len(word)) for value in db[word]]
        min_word = None  # lazily-computed lowest-frequency word of the db
        for word, values in characteristic_words.items():
            if word in db:
                # Known word: average the stored and observed stats.
                db[word] = [(db[word][i] + values[i]) / 2
                            for i in range(len(values))]
                continue
            # New word: dampened frequency, same weighting as the decay above.
            freq = values[0] / (2 * len(word))
            if len(db) < limit:
                # The database is not full, just add the word.
                db[word] = [freq]
                continue
            # Database full: replace the lowest-frequency word if ours is higher.
            if min_word is None:
                min_word, min_freq = min(((w, stats[0]) for w, stats in db.items()),
                                         key=lambda pair: pair[1])
            if freq > min_freq:
                db.pop(min_word)
                # Store the dampened frequency, consistent with the not-full
                # branch (the original stored the raw stats here).
                db[word] = [freq]
                # Force a recomputation of the minimum: after the pop the
                # lowest-frequency word is unknown (the original wrongly
                # assumed the inserted word was the new minimum).
                min_word = None

    def guess(self, text, filepath, answer=None):
        """
        Tries to guess the language of $text.
        $answer is for testing mode. It contains the real language code
        and is compared with the guess.

        Returns a tuple (retcode, results).
        $retcode is the number of matches:
         * -1 for wrong guess (in testing mode only)
         * 0 for if all probabilities are under 50%
         * 1 if there is a unique match
         * more if there are close matches
        $results is a list of tuples (langcode, probability)
        """
        characteristic_words = linguistic.characteristic_words(text)
        results = [(lang, self.compute_likeness(characteristic_words, lang))
                   for lang in self.db]
        results.sort(reverse=True, key=lambda x: x[1])
        if not results:
            # Empty database: nothing can match (the original raised
            # IndexError on results[0] in this case).
            return (0, results)
        prev = results[0]
        if prev[1] < 0.5:
            # All probabilities are under 50%, possible causes:
            # - the language of the text is not in the db
            # - the proportion of "prose" in the document is low (there are
            #   very few characteristic words in a table for example)
            return (0, results)
        # Count how many top scores are "close" to each other.
        i = 1
        for cur in results[1:]:
            if cur[1] == 0:
                break
            distance = prev[1] - cur[1]
            average = (prev[1] + cur[1]) / 2
            # Relative gap between consecutive scores; above 0.2 the next
            # score is considered clearly worse and the scan stops.
            coeff = distance / (0.5 + average)
            if coeff > 0.2:
                break
            prev = cur
            i += 1
        if i == 1:
            if answer and results[0][0] != answer:
                return (-1, results)
            return (1, results)
        return (i, results)

    def print_guess(self, text, filepath, answer=None):
        """
        Frontend to Language_guesser.guess().
        Same arguments and same return value.
        """
        retcode, results = self.guess(text, filepath, answer)
        if retcode != 1 or not answer:
            print('file:', filepath)
        if retcode == -1:
            print('wrong guess: {} with a score of {:.1%}'.format(*results[0]))
            print(results[1:])
            print()
            return (retcode, results)
        elif retcode == 0:
            print('no language in the database matches, closest is {} with a score of {:.1%}'.format(*results[0]))
        elif retcode == 1:
            if not answer:
                print('the language of the text probably is {} (sure at {:.1%})'.format(*results[0]))
            else:
                return (retcode, results)
        else:
            # Only the first retcode entries are close matches (the original
            # wrongly printed len(results), the total number of languages).
            print(retcode, 'languages match closely:')
            for r in results[:retcode]:
                print('{} {:.1%}'.format(*r))
        print()
        return (retcode, results)

    def save_db(self, dbpath=None):
        """Saves the database to dbpath (defaults to the path given at construction)."""
        dbpath = dbpath or self.dbpath
        blocks = []
        for lang, stats in self.db.items():
            # One block per language: language code, then one line per word,
            # sorted by decreasing frequency and alphabetically on ties.
            lines = [lang]
            for word, values in sorted(stats.items(),
                                       key=lambda x: (-x[1][0], x[0])):
                lines.append(word + '\t'
                             + '\t'.join('%.4f' % value for value in values))
            blocks.append('\n'.join(lines))
        # 'with' guarantees the handle is flushed and closed (the original
        # never closed it).
        with open(dbpath, 'w') as dbfile:
            dbfile.write('\n\n'.join(blocks))

    def search_multilingual_words(self):
        """
        Search the db for multilingual words.
        Returns a list of tuples (word, langs), or None if the db is empty.
        """
        if not self.db:
            print('db is empty')
            return
        # Map each word to the set of languages whose db contains it.
        found = {}
        for lang, words in self.db.items():
            for word in words:
                found.setdefault(word, set()).add(lang)
        multilingual_words = [(word, langs) for word, langs in found.items()
                              if len(langs) > 1]
        # Most widely shared words first, alphabetical on ties.
        multilingual_words.sort(key=lambda x: (-len(x[1]), x[0]))
        return multilingual_words