#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Command-line front end for ``Language_guesser``.

Depending on the options given, the script can:

* delete the database file (``--reset-db``),
* feed files named ``file.$lang.$ext`` to the guesser and save the
  database (``--learn``),
* list the multilingual words found in the database
  (``--multilingual-words``),
* guess the language of the positional ``files`` and print statistics,
  optionally comparing each guess with the language tag found in the file
  name (``--testing-mode``).
"""

import argparse
import os.path

from language_guesser import Language_guesser
import htmlutils


def lang_from_filepath(filepath):
    """Return the language tag of a file named ``file.$lang.$ext``.

    The tag is the second-to-last dot-separated component of the base name,
    so ``'corpus/page.en.html'`` gives ``'en'``.  A base name with a single
    dot (``'page.html'``) yields the part before the dot.

    Raises:
        IndexError: if the base name contains no dot at all.
    """
    return os.path.basename(filepath).split('.')[-2]


def open_html(filepath):
    """Read ``filepath`` with ``htmlutils`` and return its extracted content."""
    return htmlutils.get_content(htmlutils.read_file(filepath))


def open_file(filepath):
    """Return the text of ``filepath``, treating it as HTML when appropriate.

    The file goes through :func:`open_html` when its extension is ``html``
    or ``htm`` (case-insensitive) or when ``--force-html`` was given on the
    command line; otherwise it is read as plain text.

    Relies on the module-level ``args`` namespace, which is created further
    down before this function is first called.
    """
    ext = os.path.basename(filepath).split('.')[-1].lower()
    if ext in ('html', 'htm') or args.force_html:
        return open_html(filepath)
    # The context manager closes the handle even if read() raises.
    with open(filepath, 'r') as handle:
        return handle.read()


parser = argparse.ArgumentParser()
parser.add_argument(
    'files',
    metavar='file',
    nargs='*'
)
parser.add_argument(
    '-d', '--dbpath',
    metavar='file.db',
    default='languages.db'
)
parser.add_argument(
    '--force-html',
    default=False,
    action='store_true',
    help='Do not use file extension to determine format but force HTML instead.'
)
parser.add_argument(
    '-l', '--learn',
    metavar='file.$lang.$ext',
    nargs='+',
    default=[],
    help='Create or improve the database by analyzing those files.'
)
parser.add_argument(
    '-m', '--multilingual-words',
    default=False,
    action='store_true',
    help='Search the db for multilingual words.'
)
parser.add_argument(
    '-q', '--quiet',
    default=False,
    action='store_true'
)
parser.add_argument(
    '-r', '--reset-db',
    default=False,
    action='store_true',
    help='Delete the db file before starting.'
)
parser.add_argument(
    '-t', '--testing-mode',
    default=False,
    action='store_true',
    help='Test the accuracy of the guesses by checking the results, the file names need to be of the form "file.$lang.$ext".'
)
args = parser.parse_args()

# Start from an empty database when requested.
if args.reset_db and os.path.isfile(args.dbpath):
    os.remove(args.dbpath)

guesser = Language_guesser(args.dbpath)

# Learning phase: the language of each sample comes from its file name.
for filepath in args.learn:
    guesser.learn(open_file(filepath), lang_from_filepath(filepath))
if args.learn:
    # Persist once, after every sample has been analysed.
    guesser.save_db()

if args.multilingual_words:
    for word, langs in guesser.search_multilingual_words():
        print(len(langs), word, ' '.join(langs), sep=' ')

if args.files:
    total = len(args.files)
    if args.testing_mode:
        # Buckets, in the order of the summary line below:
        # [wrong, unknown, good, uncertain].
        # NOTE(review): assumes retcode is -1 (wrong), 0 (unknown), 1 (good)
        # or greater than 1 (uncertain) -- confirm against Language_guesser.
        counts = [0] * 4
        # Number of files left unguessed, per expected language.
        unknown = {}
        # --quiet selects the non-printing variant of the guess call.
        guess = guesser.guess if args.quiet else guesser.print_guess
        for filepath in args.files:
            text = open_file(filepath)
            lang = lang_from_filepath(filepath)
            retcode, _ = guess(text, filepath, answer=lang)
            counts[3 if retcode > 1 else retcode + 1] += 1
            if retcode == 0:
                unknown[lang] = unknown.get(lang, 0) + 1
        stats = [count / total for count in counts]
        print(
            '{:.1%} wrong guesses, {:.1%} unknown, {:.1%} good guesses, {:.1%} uncertain'
            .format(*stats)
        )
        if unknown:
            # Most frequently unmatched language first, ties broken by name.
            ranked = sorted(unknown.items(), key=lambda item: (-item[1], item[0]))
            unknown_total = counts[1]
            print('unmatched were: ' + ' '.join(
                '{}:{:.1%}'.format(lang, count / unknown_total)
                for lang, count in ranked
            ))
    else:
        # Buckets, in the order of the summary line below:
        # [unknown, guesses, uncertain].
        stats = [0] * 3
        for filepath in args.files:
            retcode, _ = guesser.print_guess(open_file(filepath), filepath)
            stats[2 if retcode > 1 else retcode] += 1
        # NOTE(review): the original indentation was lost; this blank line is
        # assumed to separate the per-file output from the summary -- confirm.
        print()
        stats = [stat / total for stat in stats]
        print('{:.1%} unknown, {:.1%} guesses, {:.1%} uncertain'.format(*stats))