#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Print the stop words shared by a set of files, then a per-file term report.

Every file named on the command line is loaded with ``open_file``.  The
loaded texts are handed together to ``find_stop_words`` and the result is
printed on one line.  Then, for each file, the mapping returned by
``find_repeated_terms`` is printed as ``<value> <term>`` lines, highest
value first and ties broken by term in ascending order.
"""

import argparse

from fileutils import *
from stop_words import *
from terms import *


def _by_value_desc_then_term(item):
    """Sort key for ``(term, value)`` pairs: larger values first, then term."""
    term, value = item
    return (-value, term)


def _report_lines(term_values):
    """Return the ``<value> <term>`` lines for a term mapping, in print order."""
    ordered = sorted(term_values.items(), key=_by_value_desc_then_term)
    return [str(value) + ' ' + term for term, value in ordered]


parser = argparse.ArgumentParser()
parser.add_argument('files', metavar='file', nargs='+')
# NOTE(review): args.force_html is parsed but never read anywhere in the
# visible script -- confirm whether open_file is supposed to receive it.
parser.add_argument(
    '--force-html',
    default=False,
    action='store_true',
    help='Do not use file extension to determine format but force HTML instead.',
)
args = parser.parse_args()

# Load every input up front: the stop words are computed over all of the
# texts together before any per-file report is produced.
texts = []
for path in args.files:
    texts.append(open_file(path))

stop_words = find_stop_words(texts)
print(' '.join(stop_words))

for path, content in zip(args.files, texts):
    repeated = find_repeated_terms(content, stop_words)
    print()  # blank separator line before each file's section
    print(path)
    # A single print of the joined lines, so an empty result still emits one
    # empty line.
    print('\n'.join(_report_lines(repeated)))