#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Regex-based helpers that turn an HTML document into plain text."""

import html.entities
import re


# Named references may contain digits after the first letter (e.g. "frac12").
alpha_entity_re = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')
# Decimal ("&#233;") or hexadecimal ("&#xE9;") numeric references.
num_entity_re = re.compile(r'&#([0-9]+|[xX][0-9A-Fa-f]+);')


def replace_entities(source_html):
    """Return *source_html* with character references replaced by characters.

    Named references are replaced first, numeric ones second.  References
    that cannot be resolved are left in the text unchanged.

    NOTE: because of the two passes, a doubly escaped numeric reference such
    as ``&amp;#65;`` ends up as ``A``.
    """
    source_html = alpha_entity_re.sub(replace_alpha_entity, source_html)
    source_html = num_entity_re.sub(replace_num_entity, source_html)
    return source_html


def replace_num_entity(match):
    """Substitution callback for ``num_entity_re``.

    Returns the character for the numeric reference captured in group 1, or
    the whole matched text when the number is not a valid code point.
    """
    reference = match.group(1)
    try:
        if reference[0] in 'xX':
            return chr(int(reference[1:], 16))
        return chr(int(reference))
    except (ValueError, OverflowError):
        # Out-of-range code point or unparsable number: keep the text as is.
        return match.group(0)


def replace_alpha_entity(match):
    """Substitution callback for ``alpha_entity_re``.

    Returns the character for the entity name captured in group 1, or the
    whole matched text when the name is not a known HTML entity.
    """
    try:
        return chr(html.entities.name2codepoint[match.group(1)])
    except KeyError:
        return match.group(0)


# Elements whose content is never readable text; removed together with
# everything up to the matching closing tag.
cleanup_tags_re = re.compile(r'<(script|select|style).+?</\1>', re.I | re.S)
# HTML comments.
cleanup_comments_re = re.compile(r'<!--.*?-->', re.S)
# Anchors whose visible text is just a bare URL.
cleanup_links_re = re.compile(r'<a [^>]*>https?://[^<]+</a>', re.I)
# NOTE(review): assumes reference lists are marked up as
# <ol class="references"> (MediaWiki style) -- confirm against real input.
cleanup_references_re = re.compile(
    r'<ol class="references">.*?</ol>', re.I | re.S)
# <span>/<cite> elements carrying a lang="..." attribute.  The attribute
# search is confined to the opening tag ([^>]*) so that an unrelated earlier
# <span ...> cannot drag the text in between into the match.
cleanup_otherlang_re = re.compile(
    r'<(span|cite) [^>]*?lang="[^"]{2,}"[^>]*>.*?</\1>', re.I | re.S)
# Any remaining tag.
cleanup_markup_re = re.compile(r'<[^>]+>')
# Stray angle brackets, written literally or as &gt; / &lt; entities.
cleanup_gt_lt_re = re.compile(r'(?:&gt;|&lt;|>|<)')
cleanup_whitespaces_re = re.compile(r'\s+')


def get_content(source_html):
    """Return the text of *source_html* with markup stripped.

    Script/select/style elements are dropped entirely; comments, URL-only
    links, reference lists, elements flagged with a ``lang`` attribute, all
    remaining tags and stray angle brackets are replaced by a space.
    Character references are then decoded and every run of whitespace is
    collapsed to a single space.  The result is not stripped, so it may start
    or end with one space.
    """
    source_html = cleanup_tags_re.sub('', source_html)
    source_html = cleanup_comments_re.sub(' ', source_html)
    source_html = cleanup_links_re.sub(' ', source_html)
    source_html = cleanup_references_re.sub(' ', source_html)
    source_html = cleanup_otherlang_re.sub(' ', source_html)
    source_html = cleanup_markup_re.sub(' ', source_html)
    # Angle brackets are removed before entities are decoded so that decoding
    # "&lt;" / "&gt;" cannot put them back into the output.
    source_html = cleanup_gt_lt_re.sub(' ', source_html)
    source_html = replace_entities(source_html)
    source_html = cleanup_whitespaces_re.sub(' ', source_html)
    return source_html


title_re = re.compile(r'<title>(.+?)</title>', re.I)


def get_title(source_html):
    """Return the content of the first ``<title>`` element, or ``''``.

    The title must sit on a single line; the pattern does not match across
    newlines.
    """
    match = title_re.search(source_html)
    return match.group(1) if match else ''


charset_re = re.compile(
    rb'<meta http-equiv=["\']Content-Type["\']\s*'
    rb'content=["\']text/html; ?charset=(.+?)["\']',
    re.I)


def read_file(filepath):
    """Read and decode an HTML file, using its meta tag to pick the encoding.

    The encoding is taken from a ``<meta http-equiv="Content-Type" ...>``
    declaration when one is found; otherwise ``iso-8859-1`` is used.

    Raises:
        UnicodeDecodeError: if the declared encoding is unknown or the file
            content cannot be decoded with it.
        OSError: if the file cannot be opened or read.
    """
    # Read-only binary mode: no write permission is needed, and the handle is
    # closed as soon as the content has been read.
    with open(filepath, 'rb') as source_file:
        source_html = source_file.read()

    encoding = 'iso-8859-1'
    match = charset_re.search(source_html)
    if match:
        # A garbled (non-ASCII) charset name becomes an unknown codec name
        # and is reported by the lookup failure below.
        encoding = match.group(1).decode('ascii', 'replace').strip()

    try:
        return source_html.decode(encoding)
    except UnicodeDecodeError as err:
        # UnicodeDecodeError takes five arguments; reuse the position
        # information of the original failure.
        raise UnicodeDecodeError(
            err.encoding, err.object, err.start, err.end,
            'could not determine file encoding') from err
    except (LookupError, ValueError) as err:
        # The declared charset is not a codec name Python knows.
        raise UnicodeDecodeError(
            encoding, source_html, 0, len(source_html),
            'could not determine file encoding') from err