#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import hashlib
import itertools
import os
import re
import sys
import urllib.request
import xml.dom.minidom
user_agent = 'is evil and those who rely on it are incompetent'
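# Deliberately unhelpful User-Agent string, presumably because some wikis
# reject the default urllib one; the wording reads as the author's jab at
# user-agent sniffing, not a functional requirement.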
def get_url(url):
    """Download a URL or retrieve it from the "cache" directory (no cache timeout)."""
    cache_dir = 'cache'
    h = hashlib.md5(url.encode('utf8')).hexdigest()
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    # Cache entries are named <md5 of url>_<charset> so the response encoding survives.
    for f in next(os.walk(cache_dir))[2]:
        if f.startswith(h + '_'):
            with open(os.path.join(cache_dir, f), 'r', encoding=f.split('_')[1]) as cached:
                return cached.read()
    # FancyURLopener is deprecated and its open() takes POST data, not a mode;
    # send the User-Agent through a Request instead.
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request) as f:
        charset = f.headers.get_content_charset() or 'utf-8'
        b = f.read()
    with open(os.path.join(cache_dir, h + '_' + charset), 'wb') as cached:
        cached.write(b)
    return b.decode(charset)
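# Hypothetical example of the caching above: fetching http://example.org/x with
# a declared charset of ISO-8859-1 writes the raw bytes to
# cache/<md5 of url>_ISO-8859-1, and any later get_url() of the same URL reads
# that file back using the charset stored in its name.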
# Each row of Special:LongPages is assumed to be an <li> holding a history link
# followed by the article link; the article's whole <a> element is captured.
long_page_re = re.compile('<li[^>]*>[^<]*<a[^>]*>.+?</a>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')
def iter_long_pages(wiki, limit=50, offset=0):
    """Yield a (title, url) tuple for each long page of $wiki (example: en.wikipedia.org) from $offset to $offset+$limit."""
    base_url = 'http://' + wiki
    url = base_url + '/w/index.php?title=Special:LongPages&limit=%i&offset=%i' % (limit, offset)
    for match in long_page_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(1)).documentElement
        yield (a.firstChild.nodeValue, base_url + a.getAttribute('href'))
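# Illustrative yield (values are hypothetical, not taken from a real listing):
# ('Some long article', 'http://en.wikipedia.org/wiki/Some_long_article')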
# Interlanguage links in the sidebar; assumes the classic MediaWiki markup
# <li class="interwiki-xx">, capturing the language code and the <a> element.
alt_lang_re = re.compile('<li class="interwiki-([^"]*)"[^>]*>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')
def iter_alternative_languages(url):
    """Yield a (language, title, url) tuple for each alternative language of $url."""
    for match in alt_lang_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(2)).documentElement
        yield (match.group(1), a.getAttribute('title'), a.getAttribute('href'))
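# Illustrative yield, assuming the page has a German interlanguage link
# (values are hypothetical):
# ('de', 'Hypothetischer Titel', '//de.wikipedia.org/wiki/Hypothetischer_Titel')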
class create_set(argparse.Action):
    """argparse action that stores the collected values as a set, dropping duplicates."""
    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, set(values))
parser = argparse.ArgumentParser()
parser.add_argument('wiki', help='example: el.wikipedia.org')
parser.add_argument('language', help='language of the wiki')
parser.add_argument('languages', nargs='+', action=create_set, help='other languages you wish the articles to be available in')
parser.add_argument('-a', '--ask-for-each', default=False, action='store_true', help='propose download for each match')
parser.add_argument('-d', '--destdir', default='.')
parser.add_argument('-l', '--ref-lang', default='en', help='language used for the directory names')
parser.add_argument('-n', '--non-interactive', dest='interactive', default=sys.stdin.isatty(), action='store_false')
args = parser.parse_args()
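# Hypothetical invocation (the script name is illustrative): list the long
# pages of el.wikipedia.org that also exist in English and French, saving
# articles under ./articles:
#   ./longpages.py -d articles el.wikipedia.org el en fr
# Note that --ref-lang (default: en) must be the wiki's own language or one of
# the requested languages, since download_articles() names directories after it.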
def download_articles(dict_by_lang):
    """Download each language version of an article into a directory named after its $ref_lang title."""
    ref_title = dict_by_lang[args.ref_lang][0]
    dest = os.path.join(args.destdir, ref_title)
    if not os.path.isdir(dest):
        os.makedirs(dest)
    print('downloading "' + args.ref_lang + ':' + ref_title + '" and its equivalents in ' + str(len(dict_by_lang) - 1) + ' other languages')
    for lang, (title, url) in dict_by_lang.items():
        print('\tdownloading ' + lang + ':' + title)
        with open(os.path.join(dest, lang + '.html'), 'w', encoding='utf-8') as out:
            out.write(get_url(url))
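# With the hypothetical invocation above, a downloaded match titled "Foo" in
# the reference language ends up as:
#   articles/Foo/el.html, articles/Foo/en.html, articles/Foo/fr.html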
matches = []
i = 0
for title, long_page_url in iter_long_pages(args.wiki):
    found = {}
    for alt_lang, alt_title, alt_url in iter_alternative_languages(long_page_url):
        if alt_lang in args.languages:
            if alt_lang in found:
                raise Exception('more than one equivalent in ' + alt_lang)
            found[alt_lang] = (alt_title, alt_url)
    # Keep the page only if every requested language has an equivalent.
    if len(found) == len(args.languages):
        print(str(i) + ' ' + title + ': ' + long_page_url)
        for alt_lang, alt in found.items():
            print('\t' + alt_lang + ':' + alt[0])
        found[args.language] = (title, long_page_url)
        if args.interactive and args.ask_for_each:
            a = input('Download this article and its equivalents? [y/N] ')
            if a.lower() == 'y':
                download_articles(found)
        matches.append(found)
        i += 1
if args.interactive:
    try:
        values = input('Enter the ranges of matches you want to download (example: "0 6-9"): ').split()
    except EOFError:
        print()
        sys.exit(0)
    for value in values:
        if '-' in value:
            ints = [int(i) for i in value.split('-')]
            if len(ints) != 2:
                raise ValueError('"' + value + '" is not a valid range')
            # Ranges are inclusive on both ends.
            for match in matches[ints[0]:ints[1] + 1]:
                download_articles(match)
        else:
            download_articles(matches[int(value)])