#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see https://www.gnu.org/licenses/.

"""Print the title of each file given on the command line and write stats.

For every path in ``sys.argv[1:]`` the script reads the file through
``htmlutils``, prints its title (or a notice when none is found), and
writes three sibling files next to the input:

* ``<path>.words.csv``        -- output of ``linguistic.words_stats``
* ``<path>.chars.csv``        -- output of ``linguistic.chars_stats``
* ``<path>.word_groups.csv``  -- output of ``linguistic.word_groups_stats``

The exact contents of those files are determined by the ``linguistic``
module, which is not part of this file.
"""

import sys

import htmlutils
import linguistic


for filepath in sys.argv[1:]:
    print('file:', filepath)
    source_html = htmlutils.read_file(filepath)

    # get title
    title = htmlutils.get_title(source_html)
    if title:
        print('title: ' + title)
    else:
        print('no title found')

    # get content
    text = htmlutils.get_content(source_html)
    # The raw source is no longer needed once the content is extracted;
    # drop the reference before the statistics passes run.
    del source_html

    # Each output file is opened in a ``with`` block so it is flushed and
    # closed as soon as its statistics pass finishes, even if that pass
    # raises, instead of relying on garbage collection to close it.
    with open(filepath + '.words.csv', 'w', encoding='utf-8') as words_file:
        linguistic.words_stats(words_file, text, linguistic.word_re)

    with open(filepath + '.chars.csv', 'w', encoding='utf-8') as chars_file:
        linguistic.chars_stats(chars_file, text, linguistic.char_re)

    with open(filepath + '.word_groups.csv', 'w',
              encoding='utf-8') as groups_file:
        linguistic.word_groups_stats(groups_file, text, linguistic.word_re)

    # Blank line separates the console output of consecutive input files.
    print()