#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os.path

import linguistic


class Language_guesser:
    """Guesses the language of a text from a database of characteristic words.

    The database maps a language code to a dict of word -> list of stats
    (the first stat is the word's frequency).  On disk it is a plain-text
    file made of blocks separated by blank lines: the first line of a block
    is the language code, each following line is a word and its stats
    separated by tabs.
    """

    def __init__(self, dbpath):
        """Load the word database from dbpath, or start empty if no such file.

        dbpath -- path of the database file; kept as the default for save_db().
        """
        self.dbpath = dbpath
        self.db = {}
        if os.path.isfile(dbpath):
            # 'with' guarantees the handle is closed (the original leaked it).
            with open(dbpath, 'r') as dbfile:
                content = dbfile.read()
            for block in content.split('\n\n'):
                # Skip blank lines (e.g. a trailing newline at end of file).
                lines = [line for line in block.split('\n') if line]
                if not lines:
                    continue
                stats = {}
                for line in lines[1:]:
                    # split('\n') above already removed the newline, so the
                    # whole line is data (the original's line[:-1] chopped the
                    # last digit of the last value on every line).
                    values = line.split('\t')
                    stats[values[0]] = [float(v) for v in values[1:]]
                # First line of the block is the language code.
                self.db[lines[0]] = stats

    def compute_likeness(self, characteristic_words, lang):
        """Return the fraction of lang's database words present in characteristic_words.

        characteristic_words -- iterable/mapping of words found in a document
        lang -- language code to compare against
        Raises KeyError if lang is not in the database.
        """
        if lang not in self.db:
            raise KeyError('"'+lang+'" language is not in the database')
        db = self.db[lang]
        matching_words = sum(1 for word in db if word in characteristic_words)
        return matching_words / len(db)

    def learn(self, text, lang):
        """Adds a language to the database or improves the stats of an existing one."""
        limit = 15
        try:
            characteristic_words = linguistic.characteristic_words(text, limit=limit)
        except ValueError:
            print('uninteresting article, skipping')
            return
        if lang not in self.db:
            self.db[lang] = characteristic_words
            return
        db = self.db[lang]
        # Dampen (by 2 * word length) the stats of db words absent from the
        # current document.  This aims at removing topic-specific words.
        for word in set(db).difference(characteristic_words):
            db[word] = [value / (2 * len(word)) for value in db[word]]
        min_word = None  # lazily-computed lowest-frequency word of the db
        for word, values in characteristic_words.items():
            if word in db:
                # Known word: average the stored and observed stats.
                db[word] = [(db[word][i] + values[i]) / 2
                            for i in range(len(values))]
                continue
            # New word: dampened frequency, same weighting as the decay above.
            freq = values[0] / (2 * len(word))
            if len(db) < limit:
                # The database is not full, just add the word.
                db[word] = [freq]
                continue
            # Database full: replace the lowest-frequency word if ours is higher.
            if min_word is None:
                min_word, min_freq = min(((w, stats[0]) for w, stats in db.items()),
                                         key=lambda pair: pair[1])
            if freq > min_freq:
                db.pop(min_word)
                # Store the dampened frequency, consistent with the not-full
                # branch (the original stored the raw stats here).
                db[word] = [freq]
                # Force a recomputation of the minimum: after the pop the
                # lowest-frequency word is unknown (the original wrongly
                # assumed the inserted word was the new minimum).
                min_word = None

    def guess(self, text, filepath, answer=None):
        """
        Tries to guess the language of $text.
        $answer is for testing mode. It contains the real language code
        and is compared with the guess.

        Returns a tuple (retcode, results).
        $retcode is the number of matches:
         * -1 for wrong guess (in testing mode only)
         * 0 for if all probabilities are under 50%
         * 1 if there is a unique match
         * more if there are close matches
        $results is a list of tuples (langcode, probability)
        """
        characteristic_words = linguistic.characteristic_words(text)
        results = [(lang, self.compute_likeness(characteristic_words, lang))
                   for lang in self.db]
        results.sort(reverse=True, key=lambda x: x[1])
        if not results:
            # Empty database: nothing can match (the original raised
            # IndexError on results[0] in this case).
            return (0, results)
        prev = results[0]
        if prev[1] < 0.5:
            # All probabilities are under 50%, possible causes:
            # - the language of the text is not in the db
            # - the proportion of "prose" in the document is low (there are
            #   very few characteristic words in a table for example)
            return (0, results)
        # Count how many top scores are "close" to each other.
        i = 1
        for cur in results[1:]:
            if cur[1] == 0:
                break
            distance = prev[1] - cur[1]
            average = (prev[1] + cur[1]) / 2
            # Relative gap between consecutive scores; above 0.2 the next
            # score is considered clearly worse and the scan stops.
            coeff = distance / (0.5 + average)
            if coeff > 0.2:
                break
            prev = cur
            i += 1
        if i == 1:
            if answer and results[0][0] != answer:
                return (-1, results)
            return (1, results)
        return (i, results)

    def print_guess(self, text, filepath, answer=None):
        """
        Frontend to Language_guesser.guess().
        Same arguments and same return value.
        """
        retcode, results = self.guess(text, filepath, answer)
        if retcode != 1 or not answer:
            print('file:', filepath)
        if retcode == -1:
            print('wrong guess: {} with a score of {:.1%}'.format(*results[0]))
            print(results[1:])
            print()
            return (retcode, results)
        elif retcode == 0:
            print('no language in the database matches, closest is {} with a score of {:.1%}'.format(*results[0]))
        elif retcode == 1:
            if not answer:
                print('the language of the text probably is {} (sure at {:.1%})'.format(*results[0]))
            else:
                return (retcode, results)
        else:
            # Only the first retcode entries are close matches (the original
            # wrongly printed len(results), the total number of languages).
            print(retcode, 'languages match closely:')
            for r in results[:retcode]:
                print('{} {:.1%}'.format(*r))
        print()
        return (retcode, results)

    def save_db(self, dbpath=None):
        """Saves the database to dbpath (defaults to the path given at construction)."""
        dbpath = dbpath or self.dbpath
        blocks = []
        for lang, stats in self.db.items():
            # One block per language: language code, then one line per word,
            # sorted by decreasing frequency and alphabetically on ties.
            lines = [lang]
            for word, values in sorted(stats.items(),
                                       key=lambda x: (-x[1][0], x[0])):
                lines.append(word + '\t'
                             + '\t'.join('%.4f' % value for value in values))
            blocks.append('\n'.join(lines))
        # 'with' guarantees the handle is flushed and closed (the original
        # never closed it).
        with open(dbpath, 'w') as dbfile:
            dbfile.write('\n\n'.join(blocks))

    def search_multilingual_words(self):
        """
        Search the db for multilingual words.
        Returns a list of tuples (word, langs), or None if the db is empty.
        """
        if not self.db:
            print('db is empty')
            return
        # Map each word to the set of languages whose db contains it.
        found = {}
        for lang, words in self.db.items():
            for word in words:
                found.setdefault(word, set()).add(lang)
        multilingual_words = [(word, langs) for word, langs in found.items()
                              if len(langs) > 1]
        # Most widely shared words first, alphabetical on ties.
        multilingual_words.sort(key=lambda x: (-len(x[1]), x[0]))
        return multilingual_words