#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse
import hashlib
import os
import re
import sys
import urllib.request
import xml.dom.minidom

user_agent = 'is evil and those who rely on it are incompetent'


def get_url(url):
    """Download a URL, or retrieve it from the "cache" directory (no cache timeout)."""
    cache_dir = 'cache'
    h = hashlib.md5(url.encode('utf-8')).hexdigest()
    os.makedirs(cache_dir, exist_ok=True)
    # Cached responses are stored as "<md5 of url>_<charset>".
    for f in next(os.walk(cache_dir))[2]:
        if f.startswith(h + '_'):
            with open(os.path.join(cache_dir, f), 'r', encoding=f.split('_')[1]) as cached:
                return cached.read()
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request) as response:
        # Fall back to UTF-8 when the server does not declare a charset.
        charset = response.headers.get_content_charset() or 'utf-8'
        b = response.read()
    with open(os.path.join(cache_dir, h + '_' + charset), 'wb') as cached:
        cached.write(b)
    return b.decode(charset)


# Entries on Special:LongPages look like
# <li>(<a ...>hist</a>) <a href="/wiki/Title" title="Title">Title</a> [123 bytes]</li>;
# the second link, the article itself, is captured.
long_page_re = re.compile('<li[^>]*>[^<]*<a[^>]*>.+?</a>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')


def iter_long_pages(wiki, limit=50, offset=0):
    """Yield a tuple (title, url) for each long page of $wiki (example:
    en.wikipedia.org) from $offset to $offset+$limit."""
    base_url = 'http://' + wiki
    url = base_url + ('/w/index.php?title=Special:LongPages&limit=%i&offset=%i'
                      % (limit, offset))
    for match in long_page_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(1)).documentElement
        yield (a.firstChild.nodeValue, base_url + a.getAttribute('href'))


# Interwiki links in the sidebar look like
# <li class="interwiki-fr"><a href="http://fr.wikipedia.org/wiki/..." title="...">...</a></li>;
# the language code and the link are captured.
alt_lang_re = re.compile('<li class="interwiki-([^"]*)"[^>]*>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')


def iter_alternative_languages(url):
    """Yield a tuple (language, title, url) for each alternative language of $url."""
    for match in alt_lang_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(2)).documentElement
        href = a.getAttribute('href')
        if href.startswith('//'):
            # Interwiki hrefs may be protocol-relative; make them absolute.
            href = 'http:' + href
        yield (match.group(1), a.getAttribute('title'), href)


class create_set(argparse.Action):
    """Store the argument's values as a set rather than a list."""

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, set(values))


parser = argparse.ArgumentParser()
parser.add_argument('wiki', help='example: el.wikipedia.org')
parser.add_argument('language', help='language of the wiki')
parser.add_argument('languages', nargs='+', action=create_set,
                    help='other languages you wish the articles to be available in')
parser.add_argument('-a', '--ask-for-each', default=False, action='store_true',
                    help='propose download for each match')
parser.add_argument('-d', '--destdir', default='.')
parser.add_argument('-l', '--ref-lang', default='en',
                    help='language used for the directory names')
parser.add_argument('-n', '--non-interactive', dest='interactive',
                    default=sys.stdin.isatty(), action='store_false')
args = parser.parse_args()

# The reference language must be one of the downloaded languages, otherwise
# download_articles() would fail with a KeyError.
if args.ref_lang != args.language and args.ref_lang not in args.languages:
    parser.error('--ref-lang must be the wiki language or one of the other languages')


def download_articles(dict_by_lang):
    ref_title = dict_by_lang[args.ref_lang][0]
    dest = os.path.join(args.destdir, ref_title)
    if not os.path.isdir(dest):
        os.makedirs(dest)
    print('downloading "' + args.ref_lang + ':' + ref_title + '" and its equivalents in '
          + str(len(dict_by_lang) - 1) + ' other languages')
    for lang, (title, url) in dict_by_lang.items():
        print('\tdownloading ' + lang + ':' + title)
        with open(os.path.join(dest, lang + '.html'), 'w', encoding='utf-8') as out:
            out.write(get_url(url))


matches = []
i = 0
for title, long_page_url in iter_long_pages(args.wiki):
    # Keep the page only if it exists in every requested language.
    found = {}
    for alt_lang, alt_title, alt_url in iter_alternative_languages(long_page_url):
        if alt_lang in args.languages:
            if alt_lang in found:
                raise Exception('more than one equivalent in ' + alt_lang)
            found[alt_lang] = (alt_title, alt_url)
    if len(found) == len(args.languages):
        print(str(i) + ' ' + title + ': ' + long_page_url)
        for alt_lang, alt in found.items():
            print('\t' + alt_lang + ':' + alt[0])
        found[args.language] = (title, long_page_url)
        if args.interactive and args.ask_for_each:
            a = input('Download this article and its equivalents? [y/N] ')
            if a.lower() == 'y':
                download_articles(found)
        matches.append(found)
        i += 1

if args.interactive:
    try:
        values = input('Enter the ranges of matches you want to download '
                       '(example: "0 6-9"): ').split()
    except EOFError:
        print()
        sys.exit(0)
    for value in values:
        if '-' in value:
            ints = [int(i) for i in value.split('-')]
            if len(ints) != 2:
                raise ValueError('"' + value + '" is not a valid range')
            for match in matches[ints[0]:ints[1] + 1]:
                download_articles(match)
        else:
            download_articles(matches[int(value)])
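
# A sketch of a typical invocation (the script name "long_articles.py" is an
# assumption, not part of this file): list the longest pages of the Greek
# Wikipedia that also have English and French versions, then pick ranges of
# matches to download into ./articles:
#
#   python3 long_articles.py el.wikipedia.org el en fr -d articles
#
# Add -a to be prompted after each match, or -n to only print the matches
# without downloading anything.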