#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Word, character and word-group frequency statistics for a text."""

import re

char_re = re.compile(r'\w')
word_re = re.compile(r'\w+')
word_re_no_numbers = re.compile(r'[^\d\W]+')


def count(text, matching_re):
    """Return a (counts, total) tuple: counts maps each lower-cased match of
    matching_re to its number of occurrences, total is the overall number of
    matches."""
    counts = {}
    total = 0
    for match in matching_re.finditer(text):
        word = match.group(0).lower()
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
        total += 1
    return (counts, total)


def characteristic_words(text, word_re, limit=50, max_word_size=5):
    """
    Compute the words that are both among the $limit most frequent ones and
    at most $max_word_size characters long.
    Return a dictionary mapping each such word to its frequency (wrapped in a
    single-element list), relative to the total count of the selected words.
    """
    counts, total = count(text, word_re)
    words_by_frequency = [(word, [count/total])
                          for word, count in counts.items() if len(word) > 1]
    words_by_frequency.sort(key=lambda x: (-x[1][0], x[0]))
    words_by_frequency = set(a[0] for a in words_by_frequency[:limit])
    words_by_size = set(word for word in counts
                        if len(word) > 1 and len(word) <= max_word_size)
    characteristic_words = words_by_frequency.intersection(words_by_size)
    if len(characteristic_words) < 10:
        raise ValueError('not enough words are both short and among the most frequent ones')
    cw_total = sum(counts[word] for word in characteristic_words)
    return {word: [counts[word]/cw_total] for word in characteristic_words}


def words_stats(out, text, word_re):
    """Print summary figures and write per-word statistics to out, one line
    per word: count, rank*count, word length, word."""
    counts, total = count(text, word_re)
    words = list(counts.keys())
    print(total, 'occurrences of words')
    print(len(words), 'different words')
    print('→ '+str(float(total)/len(words))+' occurrences/different words')
    words.sort(key=lambda x: (-counts[x], x))
    for i, word in enumerate(words):
        stat = str(counts[word])+' '+str((i+1)*counts[word])+' '+str(len(word))+' '+word
        out.write(stat+'\n')


def chars_stats(out, text, char_re):
    """Print summary figures and write per-character statistics to out, one
    line per character: count, rank*count, character."""
    counts, total = count(text, char_re)
    chars = list(counts.keys())
    print(total, 'occurrences of characters')
    print(len(chars), 'different characters')
    print('→ '+str(float(total)/len(chars))+' occurrences/different characters')
    chars.sort(key=lambda x: (-counts[x], x))
    for i, char in enumerate(chars):
        stat = str(counts[char])+' '+str((i+1)*counts[char])+' '+char
        out.write(stat+'\n')


def iter_word_groups(text, word_re, min_size=2, max_size=5):
    """Yield groups of min_size to max_size consecutive lower-cased words,
    as lists, using a sliding window over the matches of word_re."""
    words = [None]*max_size
    for match in word_re.finditer(text):
        words = words[1:] + [match.group(0).lower()]
        if not words[0]:
            # The window is not full yet.
            continue
        for size in range(min_size, max_size+1):
            yield words[0:size]


def count_word_groups(text, word_re, min_size=2, max_size=5):
    """Return a (counts, total) tuple: counts maps word groups (from min_size
    to max_size consecutive words) that occur at least twice to their number
    of occurrences; total is the number of groups examined."""
    found = set()
    counts = {}
    total = 0
    for words in iter_word_groups(text, word_re, min_size, max_size):
        total += 1
        group = ' '.join(words)
        if group in counts:
            counts[group] += 1
        elif group in found:
            # Second occurrence: start counting the group.
            counts[group] = 2
            found.remove(group)
        else:
            found.add(group)
    # Remove sub-groups: occurrences of a prefix that are already accounted
    # for by a longer group are subtracted from the prefix's own count.
    for group in sorted(counts, reverse=True):
        if group not in counts:
            continue
        s = group.split()
        for size in range(2, len(s)):
            subgroup = ' '.join(s[:size])
            if subgroup in counts:
                if counts[subgroup] == counts[group]:
                    counts.pop(subgroup)
                else:
                    counts[subgroup] -= counts[group]
    return (counts, total)


def word_groups_stats(out, text, word_re):
    """Print summary figures and write per-group statistics to out, one line
    per group: count, rank*count, group."""
    counts, total = count_word_groups(text, word_re)
    word_groups = list(counts.keys())
    print(len(word_groups), 'different word groups')
    print('→ '+str(float(total)/len(word_groups))+' occurrences/different word groups')
    word_groups.sort(key=lambda x: (-counts[x], x))
    for i, group in enumerate(word_groups):
        stat = str(counts[group])+'\t'+str((i+1)*counts[group])+'\t'+group
        out.write(stat+'\n')
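

# --- Illustrative usage sketch (an assumption, not part of the original module) ---
# A minimal way the helpers above might be driven from the command line: read a
# UTF-8 text file given as the first argument and dump word, character and
# word-group statistics to standard output.  The CLI shape and the use of
# word_re_no_numbers for characteristic_words are choices made for the example,
# not something the module itself prescribes.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: %s TEXTFILE' % sys.argv[0])
    with open(sys.argv[1], encoding='utf-8') as f:
        sample = f.read()

    words_stats(sys.stdout, sample, word_re)
    chars_stats(sys.stdout, sample, char_re)
    word_groups_stats(sys.stdout, sample, word_re)

    # Short, frequent words are often a useful signature of a text.
    try:
        print(characteristic_words(sample, word_re_no_numbers))
    except ValueError as err:
        print('characteristic_words:', err)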