#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import argparse
import os.path

from language_guesser import Language_guesser
import htmlutils


def lang_from_filepath(filepath):
	"""Return the language tag embedded in a ``name.$lang.$ext`` file path.

	The tag is the second-to-last dot-separated component of the path's
	base name, so directories containing dots do not interfere. An
	``IndexError`` is raised when the base name contains no dot at all.
	"""
	filename = os.path.basename(filepath)
	components = filename.split('.')
	return components[-2]

def open_html(filepath):
	"""Load *filepath* as HTML and return its extracted content.

	The file is read with ``htmlutils.read_file`` and the result is handed
	to ``htmlutils.get_content``, whose return value is passed through
	unchanged.
	"""
	raw = htmlutils.read_file(filepath)
	return htmlutils.get_content(raw)

def open_file(filepath):
	"""Return the text of *filepath*, choosing the reader from its extension.

	A file whose last dot-separated name component is ``html`` (compared
	case-insensitively) is loaded through ``open_html``. The same happens
	for every file when the ``--force-html`` switch is set (read from the
	module-level ``args``). Any other file is opened in text mode and its
	whole content is returned as-is.
	"""
	ext = os.path.basename(filepath).split('.')[-1].lower()
	if ext == 'html' or args.force_html:
		return open_html(filepath)
	# Use a context manager so the handle is closed as soon as the content
	# has been read, instead of being left for the garbage collector.
	with open(filepath, 'r') as stream:
		return stream.read()

# Command-line interface. The arguments are registered in the order in which
# they should appear in the generated --help output.
parser = argparse.ArgumentParser()

# Keyword arguments shared by every on/off switch below.
_switch = {'action': 'store_true', 'default': False}

# Files whose language should be guessed.
parser.add_argument('files', metavar='file', nargs='*')
# Location of the database file handed to Language_guesser.
parser.add_argument('-d', '--dbpath', metavar='file.db', default='languages.db')
parser.add_argument(
	'--force-html',
	help='Do not use file extension to determine format but force HTML instead.',
	**_switch
)
parser.add_argument(
	'-l', '--learn',
	metavar='file.$lang.$ext', nargs='+', default=[],
	help='Create or improve the database by analyzing those files.'
)
parser.add_argument(
	'-m', '--multilingual-words',
	help='Search the db for multilingual words.',
	**_switch
)
parser.add_argument('-q', '--quiet', **_switch)
parser.add_argument(
	'-r', '--reset-db',
	help='Delete the db file before starting.',
	**_switch
)
parser.add_argument(
	'-t', '--testing-mode',
	help='Test the accuracy of the guesses by checking the results, the file names need to be of the form "file.$lang.$ext".',
	**_switch
)
args = parser.parse_args()


# --reset-db: start from an empty database by removing the existing file.
if args.reset_db and os.path.isfile(args.dbpath):
	os.remove(args.dbpath)

guesser = Language_guesser(args.dbpath)

# Learning phase: feed every --learn file to the guesser, labelled with the
# language tag taken from its "file.$lang.$ext" name. Files are loaded with
# open_file() so that the extension (or --force-html) selects the reader;
# calling open_html() unconditionally made --force-html a no-op.
for filepath in args.learn:
	text = open_file(filepath)
	guesser.learn(text, lang_from_filepath(filepath))

# Only write the database back when something was actually learned.
if args.learn:
	guesser.save_db()

if args.multilingual_words:
	for word, langs in guesser.search_multilingual_words():
		print(len(langs), word, ' '.join(langs), sep='  ')


if args.files:
	total = len(args.files)

	if args.testing_mode:
		# One bucket per outcome, in the order of the summary line printed
		# below: wrong, unknown, good, uncertain.
		# NOTE(review): this assumes the guesser returns -1, 0, 1 and >1 for
		# those outcomes respectively -- confirm against Language_guesser.
		counts = [0] * 4
		# Expected language -> number of files for which retcode was 0.
		unknown = {}
		# --quiet selects the non-printing variant of the guess call.
		guess = guesser.guess if args.quiet else guesser.print_guess
		for filepath in args.files:
			text = open_file(filepath)
			lang = lang_from_filepath(filepath)
			retcode, _results = guess(text, filepath, answer=lang)
			# Every return code above 1 is folded into the last bucket.
			counts[3 if retcode > 1 else retcode + 1] += 1
			if retcode == 0:
				unknown[lang] = unknown.get(lang, 0) + 1

		ratios = [count / total for count in counts]
		print('{:.1%} wrong guesses, {:.1%} unknown, {:.1%} good guesses, {:.1%} uncertain'.format(*ratios))
		if unknown:
			# Most frequent language first, ties broken alphabetically.
			ranked = sorted(unknown.items(), key=lambda item: (-item[1], item[0]))
			# counts[1] is the number of retcode == 0 results, so it is
			# non-zero whenever `unknown` has entries.
			unknown_total = counts[1]
			print('unmatched were:  '+'  '.join('{}:{:.1%}'.format(lang, count/unknown_total) for lang, count in ranked))

	else:
		# Buckets in the order of the summary line: unknown, guesses,
		# uncertain (every return code above 1).
		tally = [0] * 3
		for filepath in args.files:
			text = open_file(filepath)
			retcode, _results = guesser.print_guess(text, filepath)
			tally[2 if retcode > 1 else retcode] += 1
			print()

		ratios = [count / total for count in tally]
		print('{:.1%} unknown, {:.1%} guesses, {:.1%} uncertain'.format(*ratios))