#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse
import hashlib
import os
import re
import sys
import urllib.request
import xml.dom.minidom

user_agent = 'is evil and those who rely on it are incompetent'


def get_url(url):
    """Download a URL, or retrieve it from the "cache" directory (no cache timeout)."""
    cache_dir = 'cache'
    h = hashlib.md5(url.encode('utf-8')).hexdigest()
    os.makedirs(cache_dir, exist_ok=True)
    # Cached responses are stored as "<md5 of url>_<charset>".
    for f in next(os.walk(cache_dir))[2]:
        if f.startswith(h + '_'):
            with open(os.path.join(cache_dir, f), 'r', encoding=f.split('_')[1]) as cached:
                return cached.read()
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request) as response:
        # Fall back to UTF-8 when the server does not declare a charset.
        charset = response.headers.get_content_charset() or 'utf-8'
        b = response.read()
    with open(os.path.join(cache_dir, h + '_' + charset), 'wb') as cached:
        cached.write(b)
    return b.decode(charset)


# Entries on Special:LongPages look like
# <li>(<a ...>hist</a>) <a href="/wiki/Title" title="Title">Title</a> [123 bytes]</li>;
# the second link, the article itself, is captured.
long_page_re = re.compile('<li[^>]*>[^<]*<a[^>]*>.+?</a>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')


def iter_long_pages(wiki, limit=50, offset=0):
    """Yield a tuple (title, url) for each long page of $wiki (example:
    en.wikipedia.org) from $offset to $offset+$limit."""
    base_url = 'http://' + wiki
    url = base_url + ('/w/index.php?title=Special:LongPages&limit=%i&offset=%i'
                      % (limit, offset))
    for match in long_page_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(1)).documentElement
        yield (a.firstChild.nodeValue, base_url + a.getAttribute('href'))


# Interwiki links in the sidebar look like
# <li class="interwiki-fr"><a href="http://fr.wikipedia.org/wiki/..." title="...">...</a></li>;
# the language code and the link are captured.
alt_lang_re = re.compile('<li class="interwiki-([^"]*)"[^>]*>[^<]*(<a[^>]*>.+?</a>)[^<]*</li>')


def iter_alternative_languages(url):
    """Yield a tuple (language, title, url) for each alternative language of $url."""
    for match in alt_lang_re.finditer(get_url(url)):
        a = xml.dom.minidom.parseString(match.group(2)).documentElement
        href = a.getAttribute('href')
        if href.startswith('//'):
            # Interwiki hrefs may be protocol-relative; make them absolute.
            href = 'http:' + href
        yield (match.group(1), a.getAttribute('title'), href)


class create_set(argparse.Action):
    """Store the argument's values as a set rather than a list."""

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, set(values))


parser = argparse.ArgumentParser()
parser.add_argument('wiki', help='example: el.wikipedia.org')
parser.add_argument('language', help='language of the wiki')
parser.add_argument('languages', nargs='+', action=create_set,
                    help='other languages you wish the articles to be available in')
parser.add_argument('-a', '--ask-for-each', default=False, action='store_true',
                    help='propose download for each match')
parser.add_argument('-d', '--destdir', default='.')
parser.add_argument('-l', '--ref-lang', default='en',
                    help='language used for the directory names')
parser.add_argument('-n', '--non-interactive', dest='interactive',
                    default=sys.stdin.isatty(), action='store_false')
args = parser.parse_args()

# The reference language must be one of the downloaded languages, otherwise
# download_articles() would fail with a KeyError.
if args.ref_lang != args.language and args.ref_lang not in args.languages:
    parser.error('--ref-lang must be the wiki language or one of the other languages')


def download_articles(dict_by_lang):
    ref_title = dict_by_lang[args.ref_lang][0]
    dest = os.path.join(args.destdir, ref_title)
    if not os.path.isdir(dest):
        os.makedirs(dest)
    print('downloading "' + args.ref_lang + ':' + ref_title + '" and its equivalents in '
          + str(len(dict_by_lang) - 1) + ' other languages')
    for lang, (title, url) in dict_by_lang.items():
        print('\tdownloading ' + lang + ':' + title)
        with open(os.path.join(dest, lang + '.html'), 'w', encoding='utf-8') as out:
            out.write(get_url(url))


matches = []
i = 0
for title, long_page_url in iter_long_pages(args.wiki):
    # Keep the page only if it exists in every requested language.
    found = {}
    for alt_lang, alt_title, alt_url in iter_alternative_languages(long_page_url):
        if alt_lang in args.languages:
            if alt_lang in found:
                raise Exception('more than one equivalent in ' + alt_lang)
            found[alt_lang] = (alt_title, alt_url)
    if len(found) == len(args.languages):
        print(str(i) + ' ' + title + ': ' + long_page_url)
        for alt_lang, alt in found.items():
            print('\t' + alt_lang + ':' + alt[0])
        found[args.language] = (title, long_page_url)
        if args.interactive and args.ask_for_each:
            a = input('Download this article and its equivalents? [y/N] ')
            if a.lower() == 'y':
                download_articles(found)
        matches.append(found)
        i += 1

if args.interactive:
    try:
        values = input('Enter the ranges of matches you want to download '
                       '(example: "0 6-9"): ').split()
    except EOFError:
        print()
        sys.exit(0)
    for value in values:
        if '-' in value:
            ints = [int(i) for i in value.split('-')]
            if len(ints) != 2:
                raise ValueError('"' + value + '" is not a valid range')
            for match in matches[ints[0]:ints[1] + 1]:
                download_articles(match)
        else:
            download_articles(matches[int(value)])
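
# A sketch of a typical invocation (the script name "long_articles.py" is an
# assumption, not part of this file): list the longest pages of the Greek
# Wikipedia that also have English and French versions, then pick ranges of
# matches to download into ./articles:
#
#   python3 long_articles.py el.wikipedia.org el en fr -d articles
#
# Add -a to be prompted after each match, or -n to only print the matches
# without downloading anything.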