#!/usr/bin/env python3
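"""Walk a Wikipedia category tree out of an SQLite dump of the page and
categorylinks tables, and render it to one or more output formats:
dot, txt, html and/or bigb."""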
import argparse
import html
import sys
import sqlite3
import re
import os.path
from subprocess import Popen, PIPE, STDOUT
import unicodedata
from pathlib import Path
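# Escape double quotes so a title can be embedded in a double-quoted DOT identifier.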
def escape_dot(s):
return s.replace('"', '\\"')
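# Wikipedia stores titles with underscores in place of spaces; undo that for display.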
def to_title_human(s):
return s.replace('_', ' ')
def to_ns_title_human(ns, title, remove_namespace):
return ns_to_txt(ns, remove_namespace) + to_title_human(title)
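# Path of the .bigb output file corresponding to a given page.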
def get_bigb_filename(outdir, ns, title, remove_namespace):
return os.path.join(outdir, bigb_title_to_id(ns_to_txt(ns, remove_namespace) + title) + '.bigb')
def strip_accents(s):
"""
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string/518232#518232
"""
return ''.join(c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn')
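# Characters kept as-is when deriving IDs from titles.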
azre = re.compile('[a-zA-Z0-9-]')
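# Characters that carry meaning in bigb markup and must be backslash-escaped.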
bigb_escape_re = re.compile('[\\\\[\\]{}<$`]')
def bigb_escape(title):
    l = []
    for c in title:
        if bigb_escape_re.match(c):
            l.append('\\')
        l.append(c)
    return ''.join(l)
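# ASCII punctuation spelled out as words in IDs, e.g. 'AT&T' -> 'at-and-t'.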
NORMALIZE_PUNCTUATION_CHARACTER_MAP = {
'%': 'percent',
'&': 'and',
'+': 'plus',
'@': 'at',
}
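# Pure Python approximation of the title -> bigb ID conversion: keep
# [a-zA-Z0-9-], spell out some punctuation, turn other ASCII into '-',
# map en/em dashes to '-', then strip accents, collapse and trim dashes,
# and lowercase, e.g. 'Khatri–Rao product' -> 'khatri-rao-product'.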
def bigb_title_to_id_native(title):
l = []
for c in title:
if ord(c) < 128:
if azre.match(c):
l.append(c)
elif c in NORMALIZE_PUNCTUATION_CHARACTER_MAP:
l.append('-')
l.append(NORMALIZE_PUNCTUATION_CHARACTER_MAP[c])
l.append('-')
else:
l.append('-')
else:
if c == '\u2013' or c == '\u2014':
l.append('-')
else:
l.append(c)
return strip_accents(re.sub('^-|-$', '', re.sub('-+', '-', ''.join(l)))).lower()
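# Canonical title -> ID conversion: pipe the title through a long-running
# external converter process (obb_title_to_id_process, a Popen with
# stdin/stdout pipes expected to be started during setup; one title in,
# one ID out per line), so generated IDs match the bigb toolchain exactly.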
def bigb_title_to_id(title):
global obb_title_to_id_process
obb_title_to_id_process.stdin.write(f'{title}\n'.encode())
obb_title_to_id_process.stdin.flush()
return obb_title_to_id_process.stdout.readline().decode()[:-1]
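# MediaWiki namespace number to title prefix: 0 is the main (article)
# namespace, 14 is the Category namespace.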
NAMESPACE_TO_TEXT = {
0: '',
14: 'Category:',
}
def ns_to_txt(ns, remove_namespace=False):
    if remove_namespace:
        return ''
    return NAMESPACE_TO_TEXT.get(ns, str(ns))
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--depth', type=int)
parser.add_argument('-D', '--depth-per-file', type=int, help='keep up to this depth per file, then interlink child pages beyond. Only supported by certain formats, e.g. bigb (HTML TODO)')
parser.add_argument('-i', '--index', default=False, action='store_true', help='make the (single) input title be an index.EXT file')
parser.add_argument('-M', '--max', type=int, help='max number of categories and pages to produce, to limit output size more precisely')
parser.add_argument('-m', '--merge-article-and-category', default=False, action='store_true', help='place articles on the corresponding same-name category if one exists, and omit the separate article in that case')
parser.add_argument('-N', '--remove-namespace', default=False, action='store_true', help='remove the namespace prefix from the output')
parser.add_argument('-O', '--output-format', action='append', default=[], help='which output formats to generate. Can be given multiple times to generate multiple formats.')
parser.add_argument('-o', '--outdir', default='out', help='directory where to place the output')
parser.add_argument('-w', '--width', type=int, help='max number of categories and pages to consider, mostly to speed up testing')
parser.add_argument('db')
parser.add_argument('titles', nargs='+')
args = parser.parse_args()
out_dot = False
out_txt = False
out_html = False
out_bigb = False
for f in args.output_format:
if f == 'dot':
out_dot = True
elif f == 'txt':
out_txt = True
elif f == 'html':
out_html = True
elif f == 'bigb':
out_bigb = True
else:
        raise ValueError(f'Unknown format: "{f}"')
outdir = args.outdir
con = sqlite3.connect(args.db)
cur = con.cursor()
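# Reconstruct the tree-shaping command-line flags as a single string.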
params_str = f'''{'' if args.depth_per_file is None else ' -D' + str(args.depth_per_file)}{'' if args.depth is None else ' -d' + str(args.depth)}{'' if args.max is None else ' -M' + str(args.max)}{' -m' if args.merge_article_and_category else ''}{'' if args.width is None else ' -w' + str(args.width)}'''
Path(outdir).mkdir(parents=True, exist_ok=True)
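# Create (or truncate to) an empty .gitignore in the output directory.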
with open(os.path.join(outdir, '.gitignore'), 'w') as gitignore_f:
pass
if out_html:
if not args.index:
with open(os.path.join(outdir, 'index.html'), 'w') as html_index_f:
html_index_f.write(f'''
''')
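# Depth-first traversal over the category tree. Each todo entry is:
#   (namespace, title, depth, parent_namespace, parent_title,
#    parent_namespace_file, parent_title_file, childi)
# The loop assumes traversal state initialized along the following lines.
# This is only a sketch: the names match their uses below, but the exact
# setup, notably the out_*_f file headers and the external ID converter
# command, is an assumption:
#
#   titles0_set = set(args.titles)
#   visited = set((14, title) for title in args.titles)
#   todo = [(14, title, 0, None, None, None, None, 0)
#           for title in reversed(args.titles)]
#   n = len(args.titles)
#   bigb_ids = set()
#   bigb_ids_repeated = []
#   obb_title_to_id_process = Popen(
#       ['ourbigbook', '--title-to-id'],  # hypothetical command
#       stdin=PIPE, stdout=PIPE)
#   if out_dot:
#       out_dot_f = open(os.path.join(outdir, 'index.dot'), 'w')  # hypothetical name
#       out_dot_f.write('digraph {\n')  # balances the '}' written at the end
#   # ... likewise out_txt_f, out_html_f and out_bigb_f.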
last_depth = 0
while todo:
namespace, title, depth, parent_namespace, parent_title, parent_namespace_file, parent_title_file, childi = todo.pop()
depth_delta = depth - last_depth
if depth_delta <= 0:
repeat_close = -depth_delta + 1
else:
repeat_close = 0
last_depth = depth
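    # With -m, a category that also has a same-title article is displayed under
    # the article's prefix-free name; the article itself is skipped when
    # children are enqueued further down.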
if args.merge_article_and_category and \
namespace == 14 and \
cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (0, title,)).fetchone() is not None:
namespace_eff = 0
else:
namespace_eff = namespace
if args.remove_namespace:
path_last = title
else:
path_last = ns_to_txt(namespace_eff) + title
title_human = to_title_human(path_last)
if out_txt:
out_txt_f.write('{}{} {}\n'.format(' ' * depth, depth, path_last))
if out_html:
out_html_f.write('\n' * repeat_close)
out_html_f.write(f'{html.escape(title_human)}\n')
if out_bigb:
if args.index and parent_title is None:
cur_bigb_f = open(os.path.join(outdir, 'index.bigb'), 'a')
else:
if args.depth_per_file is None:
cur_bigb_f = out_bigb_f
else:
cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
cur_bigb_f.write(f'= {bigb_escape(title_human)}\n')
if parent_title is not None and (args.depth_per_file is None or depth % args.depth_per_file):
cur_bigb_f.write(f'{{parent={bigb_escape(to_ns_title_human(parent_namespace, parent_title, args.remove_namespace)).replace("/", " ")}}}\n')
cur_bigb_f.write(f'{{wiki={bigb_escape(ns_to_txt(namespace_eff) + title)}}}\n')
cur_bigb_f.write(f'\n')
if args.depth_per_file is not None:
cur_bigb_f.close()
if namespace == 14:
ncats = 0
npages = 0
cat_includes = []
page_includes = []
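        # Enumerate this category's members. The descending title order is
        # deliberate: todo is a LIFO stack, so children are popped back out in
        # ascending title order, categories before articles.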
for cur_childi, (child_namespace, child_title, page_is_redirect) in enumerate(cur.execute('''
select page_namespace, page_title, page_is_redirect from categorylinks
inner join page on cl_from = page_id and cl_to = ?
order by page_namespace asc, page_title desc
''', (title,)).fetchall()):
            # Some redirects also have categories, crazily enough, e.g.
            # https://en.wikipedia.org/w/index.php?title=Khatri-Rao_product&action=edit contains:
            #
            #   #REDIRECT [[Khatri–Rao product]] {{R with possibilities}}
            #
            #   [[Category:Matrix theory]]
            if not page_is_redirect and child_namespace in (0, 14) and child_title not in titles0_set:
                # We found an article that has a category with the same name, so we just
                # ignore the article and push the category instead, to be looped over later.
if args.merge_article_and_category and child_namespace == 0:
if cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (14, child_title,)).fetchone() is not None:
visited.add((child_namespace, child_title))
child_namespace = 14
if out_bigb:
bigb_id = bigb_title_to_id(ns_to_txt(child_namespace, args.remove_namespace) + child_title)
                    # Not ideal that bigb ID deduplication also changes the other output formats. But well!!!
bigb_id_repeated = bigb_id in bigb_ids
else:
bigb_id_repeated = False
                if bigb_id_repeated:
                    bigb_ids_repeated.append(bigb_id)
                if (
                    (child_namespace, child_title) not in visited and
                    not bigb_id_repeated and
                    not (args.depth is not None and depth == args.depth)
                ):
if out_dot:
out_dot_f.write('"{}{}"->"{}{}";\n'.format(ns_to_txt(namespace), escape_dot(title), ns_to_txt(child_namespace), escape_dot(child_title)))
append = False
if child_namespace == 14:
if args.width is None or ncats < args.width:
ncats += 1
append = True
else:
if args.width is None or npages < args.width:
npages += 1
append = True
if append:
visited.add((child_namespace, child_title))
if out_bigb:
bigb_ids.add(bigb_id)
if args.max is not None and n == args.max:
break
n += 1
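                        # A child whose depth lands on a depth_per_file
                        # boundary starts a new output file: record it for the
                        # \Include list emitted after this loop, and remove any
                        # stale file from a previous run (files are opened in
                        # append mode).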
if args.depth_per_file is not None and ((depth + 1) % args.depth_per_file) == 0:
if out_bigb:
if child_namespace == 14:
cat_includes.append((child_namespace, child_title))
else:
page_includes.append((child_namespace, child_title))
                                Path(get_bigb_filename(outdir, child_namespace, child_title, args.remove_namespace)).unlink(missing_ok=True)
child_parent_namespace_file = child_namespace
child_parent_title_file = child_title
                    else:
                        if args.index and parent_title is None:
                            child_parent_namespace_file = 14
                            child_parent_title_file = 'index'
                        else:
                            child_parent_namespace_file = parent_namespace_file
                            child_parent_title_file = parent_title_file
print(f'{n} {ns_to_txt(child_namespace)}{child_title} depth={depth + 1} child_parent_title_file={child_parent_title_file}', file=sys.stderr)
todo.append((child_namespace, child_title, depth + 1, namespace, title, child_parent_namespace_file, child_parent_title_file, cur_childi))
if args.width is not None and ncats == args.width and npages == args.width:
break
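        # Emit \Include entries for children that were split out into their own
        # files, categories first.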
if out_bigb:
if parent_title_file is None:
cur_bigb_f = out_bigb_f
else:
cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
if cat_includes or page_includes:
                cur_bigb_f.write(
                    ''.join(
                        f'\\Include[{bigb_title_to_id(ns_to_txt(inc_ns, args.remove_namespace) + inc_title)}]\n'
                        for inc_ns, inc_title in
                            list(reversed(cat_includes)) + list(reversed(page_includes))
                    ) + '\n'
                )
if parent_title_file is not None:
cur_bigb_f.close()
if out_bigb:
if args.depth_per_file is None:
out_bigb_f.close()
if out_txt:
out_txt_f.close()
if out_dot:
out_dot_f.write('}\n')
out_dot_f.close()
if out_html:
out_html_f.write('\n' * last_depth)
out_html_f.write('''
''')
    out_html_f.close()
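# Record bigb IDs that collided with an already-generated page, for inspection.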
if out_bigb:
with open(os.path.join(outdir, 'ids_repeated.tmp'), 'w') as f:
f.write('\n'.join(bigb_ids_repeated))