#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import html.entities
import re
# Named character references such as "&eacute;" or "&frac12;".
# Group 1 is the entity name; names may contain digits after the first letter.
alpha_entity_re = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')
# Decimal numeric character references such as "&#233;".
# Group 1 is the code point in decimal. The "&#" prefix is required so that
# ordinary text like "12;" is never mistaken for an entity.
num_entity_re = re.compile(r'&#([0-9]+);')
def replace_entities(source_html):
    """Decode character entities in ``source_html`` and return the result.

    Two substitution passes run in sequence: first everything matched by
    ``alpha_entity_re`` is handed to ``replace_alpha_entity``, then
    everything matched by ``num_entity_re`` in the output of the first pass
    is handed to ``replace_num_entity``.
    """
    passes = (
        (alpha_entity_re, replace_alpha_entity),
        (num_entity_re, replace_num_entity),
    )
    decoded = source_html
    for pattern, replacer in passes:
        decoded = pattern.sub(replacer, decoded)
    return decoded
def replace_num_entity(match):
    """Return the character for a decimal numeric entity match.

    ``match`` is a regex match object whose group 1 holds the code point as
    decimal digits. When those digits do not denote a valid code point, the
    whole matched text (group 0) is returned unchanged.
    """
    try:
        return chr(int(match.group(1)))
    except (ValueError, OverflowError):
        # chr() raises ValueError above U+10FFFF and OverflowError for values
        # too large for a C int; int() raises ValueError for unparsable or
        # excessively long digit strings.
        return match.group(0)
def replace_alpha_entity(match):
    """Return the character for a named entity match.

    ``match`` is a regex match object whose group 1 holds the entity name.
    The name is looked up in ``html.entities.name2codepoint``; when it is not
    a known entity, the whole matched text (group 0) is returned unchanged.
    """
    try:
        return chr(html.entities.name2codepoint[match.group(1)])
    except KeyError:
        # Unknown entity name: leave the text as it was written.
        return match.group(0)
# <script>, <select> and <style> elements, including their content. The
# closing tag is anchored with "</" so that text such as "<noscript>" inside
# a script does not end the match early.
cleanup_tags_re = re.compile(r'<(script|select|style).+?</\1>', re.I | re.S)
# HTML comments, possibly spanning several lines.
cleanup_comments_re = re.compile(r'<!--.*?-->', re.S)
# Anchors whose visible text is a bare http(s) URL.
cleanup_links_re = re.compile(r'<a\b[^>]*>https?://[^<]+</a>', re.I)
# NOTE(review): the previous pattern was an unterminated string literal and
# could not be recovered. This assumes reference lists are rendered as
# <ol class="references">...</ol> (MediaWiki style) -- confirm against the
# pages this module is fed.
cleanup_references_re = re.compile(
    r'<ol\b[^>]*class="references"[^>]*>.*?</ol>', re.I | re.S)
# <span>/<cite> elements carrying a lang="..." attribute of at least two
# characters, including their content.
cleanup_otherlang_re = re.compile(
    r'<(span|cite) .*?lang="[^"]{2,}".*?>.*?</\1>', re.I | re.S)
# Any remaining tag.
cleanup_markup_re = re.compile('<[^>]+>')
# Stray angle brackets left over once the tags are gone.
# NOTE(review): this may originally have targeted the escaped forms
# "&gt;" / "&lt;" instead -- confirm the intended behaviour.
cleanup_gt_lt_re = re.compile('(?:>|<)')
# Runs of whitespace (raw string: "\s" is not a valid str escape).
cleanup_whitespaces_re = re.compile(r'\s+')
def get_content(source_html):
    """Reduce an HTML document to plain text and return it.

    Matches of ``cleanup_tags_re`` are deleted outright. Matches of the
    remaining cleanup patterns are each replaced by a single space, in a
    fixed order. Character entities are then decoded with
    ``replace_entities`` and every whitespace run is collapsed to one space.
    """
    text = cleanup_tags_re.sub('', source_html)
    space_padded_patterns = (
        cleanup_comments_re,
        cleanup_links_re,
        cleanup_references_re,
        cleanup_otherlang_re,
        cleanup_markup_re,
        cleanup_gt_lt_re,
    )
    for pattern in space_padded_patterns:
        text = pattern.sub(' ', text)
    # Decoding runs after the markup passes, so characters produced by
    # decoding are never treated as markup.
    text = replace_entities(text)
    return cleanup_whitespaces_re.sub(' ', text)
# Text of the <title> element, captured in group 1. "." does not match
# newlines here, so only a title written on a single line is found.
title_re = re.compile(r'<title\b[^>]*>(.+?)</title>', re.I)
def get_title(source_html):
    """Return group 1 of the first ``title_re`` match in ``source_html``.

    Returns the empty string when the pattern does not match.
    """
    match = title_re.search(source_html)
    if match is None:
        return ''
    # An empty or non-participating group also yields ''.
    return match.group(1) or ''
charset_re = re.compile(b'