#!/usr/bin/env python

import argparse
import html
import sys
import sqlite3
import re
import os.path
from subprocess import Popen, PIPE, STDOUT
import unicodedata
from pathlib import Path

def escape_dot(s):
    return s.replace('"', '\\"')

def to_title_human(s):
    return s.replace('_', ' ')

def to_ns_title_human(ns, title, remove_namespace):
    return ns_to_txt(ns, remove_namespace) + to_title_human(title)

def get_bigb_filename(outdir, ns, title, remove_namespace):
    return os.path.join(outdir, bigb_title_to_id(ns_to_txt(ns, remove_namespace) + title) + '.bigb')

def strip_accents(s):
    """
    https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string/518232#518232
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

azre = re.compile('[a-zA-Z0-9-]')
bigb_escape_re = re.compile('[\\\\[\\]{}<$`]')
def bigb_escape(title):
    l = []
    for c in title:
        if bigb_escape_re.match(c):
            l.append('\\')
        l.append(c)
    return ''.join(l)

NORMALIZE_PUNCTUATION_CHARACTER_MAP = {
    '%': 'percent',
    '&': 'and',
    '+': 'plus',
    '@': 'at',
}
def bigb_title_to_id_native(title):
    l = []
    for c in title:
        if ord(c) < 128:
            if azre.match(c):
                l.append(c)
            elif c in NORMALIZE_PUNCTUATION_CHARACTER_MAP:
                l.append('-')
                l.append(NORMALIZE_PUNCTUATION_CHARACTER_MAP[c])
                l.append('-')
            else:
                l.append('-')
        else:
            # En and em dashes become plain dashes.
            if c == '\u2013' or c == '\u2014':
                l.append('-')
            else:
                l.append(c)
    return strip_accents(re.sub('^-|-$', '', re.sub('-+', '-', ''.join(l)))).lower()

def bigb_title_to_id(title):
    global obb_title_to_id_process
    obb_title_to_id_process.stdin.write(f'{title}\n'.encode())
    obb_title_to_id_process.stdin.flush()
    return obb_title_to_id_process.stdout.readline().decode()[:-1]

NAMESPACE_TO_TEXT = {
    0: '',
    14: 'Category:',
}
def ns_to_txt(ns, remove_namespace=False):
    if remove_namespace:
        return ''
    if ns in NAMESPACE_TO_TEXT:
        return NAMESPACE_TO_TEXT[ns]
    return str(ns)
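# Illustrative examples for the native ID normalization above (outputs derived
# by hand from the rules in bigb_title_to_id_native, not from a real run):
#
#   bigb_title_to_id_native('Category:Linear_algebra')  # -> 'category-linear-algebra'
#   bigb_title_to_id_native('AT&T')                     # -> 'at-and-t'
#   bigb_title_to_id_native('Café')                     # -> 'cafe'
#
# bigb_title_to_id above delegates to the `ourbigbook --title-to-id`
# subprocess instead, which is the authoritative implementation.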
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--depth', type=int)
parser.add_argument('-D', '--depth-per-file', type=int,
    help='keep up to this depth per file, then interlink child pages beyond it. Only supported by certain formats, e.g. bigb (HTML TODO)')
parser.add_argument('-i', '--index', default=False, action='store_true',
    help='make the (single) input title be an index.EXT file')
parser.add_argument('-M', '--max', type=int,
    help='max number of categories and pages to produce. To limit output size more precisely.')
parser.add_argument('-m', '--merge-article-and-category', default=False, action='store_true',
    help='place articles on the corresponding category of the same name if any, and omit the article in that case')
parser.add_argument('-N', '--remove-namespace', default=False, action='store_true',
    help='remove namespace from the output')
parser.add_argument('-O', '--output-format', action='append', default=[],
    help='which output formats to generate. Can be given multiple times to generate multiple formats.')
parser.add_argument('-o', '--outdir', default='out',
    help='directory where to place output')
parser.add_argument('-w', '--width', type=int,
    help='max number of categories and pages to consider per category. Mostly to speed up testing.')
parser.add_argument('db')
parser.add_argument('titles', nargs='+')
args = parser.parse_args()
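# Example invocation (hypothetical file and script names; `db` is a SQLite
# database containing the `page` and `categorylinks` tables of a Wikipedia
# dump, and titles are given without the "Category:" prefix):
#
#   ./cattree.py -O txt -O html -d 2 enwiki.sqlite Mathematics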
out_dot = False
out_txt = False
out_html = False
out_bigb = False
for f in args.output_format:
    if f == 'dot':
        out_dot = True
    elif f == 'txt':
        out_txt = True
    elif f == 'html':
        out_html = True
    elif f == 'bigb':
        out_bigb = True
    else:
        raise ValueError(f'Unknown format: "{f}"')
outdir = args.outdir
con = sqlite3.connect(args.db)
cur = con.cursor()

# Reproduce the parameters this run was invoked with, for display in the
# generated index pages.
params_str = (
    f"{'' if args.depth_per_file is None else ' -D' + str(args.depth_per_file)}"
    f"{'' if args.depth is None else ' -d' + str(args.depth)}"
    f"{'' if args.max is None else ' -M' + str(args.max)}"
    f"{' -m' if args.merge_article_and_category else ''}"
    f"{'' if args.width is None else ' -w' + str(args.width)}"
)

Path(outdir).mkdir(parents=True, exist_ok=True)
# Write the entry while the file is still open: the bigb output compiles into
# an out/ subdirectory which we do not want to track.
with open(os.path.join(outdir, '.gitignore'), 'w') as gitignore_f:
    if out_bigb:
        gitignore_f.write('out\n')
if out_html:
    if not args.index:
        with open(os.path.join(outdir, 'index.html'), 'w') as html_index_f:
            # The exact HTML markup here is an approximation; only the visible
            # text is original.
            html_index_f.write(f'''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>Wikipedia CatTree</title>
</head>
<body>
<h1>Wikipedia CatTree</h1>
<div>Methodology: <a href="https://stackoverflow.com/questions/17432254/wikipedia-category-hierarchy-from-dumps/77313490#77313490">https://stackoverflow.com/questions/17432254/wikipedia-category-hierarchy-from-dumps/77313490#77313490</a></div>
<div>Params:{params_str}</div>
</body>
</html>
''')
if out_bigb:
    with open(os.path.join(outdir, 'CNAME'), 'w') as f:
        f.write('wikibot.ourbigbook.com\n')
    # Spawn a single ourbigbook process and keep it alive: one title per input
    # line, one ID per output line.
    obb_title_to_id_process = Popen(['ourbigbook', '--title-to-id'], stdout=PIPE, stdin=PIPE)
    with open(os.path.join(outdir, 'ourbigbook.json'), 'w') as f:
        f.write('{}')
    with open(os.path.join(outdir, 'ourbigbook.liquid.html'), 'w') as f:
        # The exact HTML markup of this template is an approximation; the
        # Liquid tags and the visible text are original.
        f.write('''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>{{ title }}{% unless is_index_article %} - Wikipedia Bot - OurBigBook Docs{% endunless %}</title>
{% if is_index_article %} {% endif %}
{{ head }}
</head>
<body>
<header>
<a href="/">OurBigBook Wikipedia Bot Documentation</a>
</header>
{{ body }}
{{ post_body }}
</body>
</html>
''')
    with open(os.path.join(outdir, 'main.scss'), 'w') as f:
        f.write('''@import 'ourbigbook/ourbigbook.common.scss';
$color: black;
$header-background-color: #AA0;
body {
  background-color: $color;
  font-family: $font-family;
  header, footer {
    background-color: $header-background-color;
  }
  header {
    align-items: center;
    display: flex;
    /* https://stackoverflow.com/questions/5078239/how-do-i-remove-the-space-between-inline-block-elements */
    font-size: 0;
    margin-bottom: 0.8 * $header-font-size;
    overflow-x: hidden;
    white-space: nowrap;
    a {
      /* Make buttons occupy the full height of the header bar.
       * https://stackoverflow.com/questions/28254332/how-to-vertically-center-the-contents-of-a-flexbox-item/28254903#28254903 */
      align-items: center;
      align-self: stretch;
      display: flex;
      color: $color;
      font-size: 32px;
      font-weight: bold;
      margin-left: 0;
      margin-right: 0;
      padding-left: 10px;
      &:first-child {
        padding-left: $toplevel-horizontal-padding-left;
      }
      padding-right: 10px;
      text-decoration: none;
      &:visited {
        color: $color;
      }
      &:hover {
        color: $header-background-color;
        background-color: $color;
      }
      &.font-awesome-container {
        font-weight: normal;
        padding-left: 5px;
        padding-right: 5px;
      }
      height: 1.2em;
      img {
        height: 100%;
        margin-right: 0.1em;
      }
    }
  }
  footer {
    word-wrap: break-word;
    a {
      $color: #00C;
      color: $color;
      text-decoration: none;
      &:visited {
        color: $color;
      }
      &:hover {
        text-decoration: underline;
      }
    }
    padding: 10px $toplevel-horizontal-padding-right 10px $toplevel-horizontal-padding-left;
    div + div {
      margin-top: 5px;
    }
  }
}
''')
    if not args.index:
        with open(os.path.join(outdir, 'index.bigb'), 'w') as bigb_index_f:
            bigb_index_f.write(f'''= OurBigBook Wikipedia Bot

Hello! I am a bot that scrapes the category graph from Wikipedia!

Methodology: https://docs.ourbigbook.com/wikipedia-bot

Params:{params_str}

''')
            for t in args.titles:
                bigb_index_f.write(f'\\Include[{bigb_title_to_id(ns_to_txt(14, args.remove_namespace) + t)}]\n')
            # TODO remove, just for symmetry with other broken files with an extra \n at end.
            bigb_index_f.write('\n')

visited = set()
bigb_ids = set()
bigb_ids_repeated = []
n = 0
titles0_set = set(args.titles)
for title in args.titles:
    visited.add((14, title))
    if args.merge_article_and_category and \
            cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (0, title,)).fetchone() is not None:
        visited.add((0, title))
    if out_bigb:
        bigb_ids.add(bigb_title_to_id(ns_to_txt(14, args.remove_namespace) + title))
        if args.merge_article_and_category:
            bigb_ids.add(bigb_title_to_id(ns_to_txt(0, args.remove_namespace) + title))
# These files are only needed (and bigb_title_to_id only works) when bigb
# output is enabled.
if out_bigb:
    if args.depth_per_file is None or args.index:
        # `title` is the last input title at this point; this mode is intended
        # for a single input title.
        out_bigb_f = open(get_bigb_filename(outdir, 14, title, args.remove_namespace), 'w')
    else:
        for title in args.titles:
            Path(get_bigb_filename(outdir, 14, title, args.remove_namespace)).unlink(missing_ok=True)
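# Depth-first traversal over the category graph. Each `todo` entry is:
#
#   (namespace, title, depth, parent_namespace, parent_title,
#    parent_namespace_file, parent_title_file, childi)
#
# where parent_namespace_file/parent_title_file name the .bigb file that the
# node's content is appended to when --depth-per-file splitting is active.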
for title in args.titles:
    todo = [(14, title, 0, None, None, 14, title, 0)]
    if args.index:
        title = 'index'
    title_human = to_title_human(title) + ' - Wikipedia CatTree'
    basename = os.path.join(outdir, title)
    if out_txt:
        out_txt_f = open(f'{basename}.txt', 'w')
    if out_dot:
        out_dot_f = open(f'{basename}.dot', 'w')
        out_dot_f.write('digraph {\n')
    if out_html:
        out_html_f = open(f'{basename}.html', 'w')
        # The exact HTML markup is an approximation. The trailing <ul> is a
        # dummy root list: the first node's repeat_close below closes it.
        out_html_f.write(f'''<!doctype html>
<html lang=en>
<head>
<meta charset=utf-8>
<title>{title_human}</title>
</head>
<body>
<h1>{title_human}</h1>
<div><a href="index.html">Index</a></div>
<ul>
''')
    last_depth = 0
    while todo:
        namespace, title, depth, parent_namespace, parent_title, \
            parent_namespace_file, parent_title_file, childi = todo.pop()
        # Number of open HTML list levels to close: one for a sibling, more
        # when we come back up from a deeper branch.
        depth_delta = depth - last_depth
        if depth_delta <= 0:
            repeat_close = -depth_delta + 1
        else:
            repeat_close = 0
        last_depth = depth
        if args.merge_article_and_category and \
                namespace == 14 and \
                cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (0, title,)).fetchone() is not None:
            namespace_eff = 0
        else:
            namespace_eff = namespace
        if args.remove_namespace:
            path_last = title
        else:
            path_last = ns_to_txt(namespace_eff) + title
        title_human = to_title_human(path_last)
        if out_txt:
            out_txt_f.write('{}{} {}\n'.format(' ' * depth, depth, path_last))
        if out_html:
            out_html_f.write('</ul>\n' * repeat_close)
            out_html_f.write(f'<ul><li>{html.escape(title_human)}</li>\n')
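        # In bigb output, each visited page becomes an OurBigBook header, e.g.
        # (illustrative):
        #
        #   = Linear algebra
        #   {parent=Category:Algebra}
        #   {wiki=Linear_algebra}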
        if out_bigb:
            if args.index and parent_title is None:
                cur_bigb_f = open(os.path.join(outdir, 'index.bigb'), 'a')
            else:
                if args.depth_per_file is None:
                    cur_bigb_f = out_bigb_f
                else:
                    cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
            cur_bigb_f.write(f'= {bigb_escape(title_human)}\n')
            if parent_title is not None and (args.depth_per_file is None or depth % args.depth_per_file):
                cur_bigb_f.write(f'{{parent={bigb_escape(to_ns_title_human(parent_namespace, parent_title, args.remove_namespace)).replace("/", " ")}}}\n')
            cur_bigb_f.write(f'{{wiki={bigb_escape(ns_to_txt(namespace_eff) + title)}}}\n')
            cur_bigb_f.write('\n')
            if args.depth_per_file is not None:
                cur_bigb_f.close()
        if namespace == 14:
            ncats = 0
            npages = 0
            cat_includes = []
            page_includes = []
            # Note: todo.pop() reverses this order, so categories are visited
            # before pages, with titles ascending.
            for cur_childi, (child_namespace, child_title, page_is_redirect) in enumerate(cur.execute('''
select page_namespace, page_title, page_is_redirect
from categorylinks
inner join page on cl_from = page_id and cl_to = ?
order by page_namespace asc, page_title desc
''', (title,)).fetchall()):
                # Some redirects also have categories, it is crazy, e.g.:
                # https://en.wikipedia.org/w/index.php?title=Khatri-Rao_product&action=edit contains:
                # ``
                # #REDIRECT [[Khatri–Rao product]]
                # {{R with possibilities}}
                #
                # [[Category:Matrix theory]]
                # ``
                if not page_is_redirect and (child_namespace == 0 or child_namespace == 14) and child_title not in titles0_set:
                    # We found an article that has a category with the same name, so we
                    # just ignore the article and push the category instead, to be
                    # looped over later.
                    if args.merge_article_and_category and child_namespace == 0:
                        if cur.execute('''select page_namespace from page where page_namespace = ? and page_title = ?''', (14, child_title,)).fetchone() is not None:
                            visited.add((child_namespace, child_title))
                            child_namespace = 14
                    if out_bigb:
                        bigb_id = bigb_title_to_id(ns_to_txt(child_namespace, args.remove_namespace) + child_title)
                        # Not ideal that the bigb ID repetition changes other outputs as well. But well!!!
                        bigb_id_repeated = bigb_id in bigb_ids
                    else:
                        bigb_id_repeated = False
                    # Track bigb IDs that appear more than once; they are dumped to
                    # ids_repeated.tmp at the end. This has to run before the skip
                    # check below, which excludes repeated IDs.
                    if bigb_id_repeated:
                        bigb_ids_repeated.append(bigb_id)
                    if (
                        (child_namespace, child_title) not in visited and
                        not bigb_id_repeated and
                        not (args.depth is not None and depth == args.depth)
                    ):
                        if out_dot:
                            out_dot_f.write('"{}{}"->"{}{}";\n'.format(
                                ns_to_txt(namespace), escape_dot(title),
                                ns_to_txt(child_namespace), escape_dot(child_title)))
                        append = False
                        if child_namespace == 14:
                            if args.width is None or ncats < args.width:
                                ncats += 1
                                append = True
                        else:
                            if args.width is None or npages < args.width:
                                npages += 1
                                append = True
                        if append:
                            visited.add((child_namespace, child_title))
                            if out_bigb:
                                bigb_ids.add(bigb_id)
                            if args.max is not None and n == args.max:
                                break
                            n += 1
                            if args.depth_per_file is not None and ((depth + 1) % args.depth_per_file) == 0:
                                # The child starts a new .bigb file.
                                if out_bigb:
                                    if child_namespace == 14:
                                        cat_includes.append((child_namespace, child_title))
                                    else:
                                        page_includes.append((child_namespace, child_title))
                                    Path(get_bigb_filename(outdir, child_namespace, child_title, args.remove_namespace)).unlink(missing_ok=True)
                                child_parent_namespace_file = child_namespace
                                child_parent_title_file = child_title
                            else:
                                # The child is appended to the same file as the current node.
                                if args.index and parent_title is None:
                                    child_parent_namespace_file = 14
                                    child_parent_title_file = 'index'
                                else:
                                    child_parent_namespace_file = parent_namespace_file
                                    child_parent_title_file = parent_title_file
                            print(f'{n} {ns_to_txt(child_namespace)}{child_title} depth={depth + 1} child_parent_title_file={child_parent_title_file}', file=sys.stderr)
                            todo.append((child_namespace, child_title, depth + 1, namespace, title, child_parent_namespace_file, child_parent_title_file, cur_childi))
                if args.width is not None and ncats == args.width and npages == args.width:
                    break
            if out_bigb:
                if parent_title_file is None:
                    cur_bigb_f = out_bigb_f
                else:
                    cur_bigb_f = open(get_bigb_filename(outdir, parent_namespace_file, parent_title_file, args.remove_namespace), 'a')
                if cat_includes or page_includes:
                    # Undo the descending SQL order so the includes are listed ascending.
                    cur_bigb_f.write(
                        ''.join(map(
                            lambda t: f'\\Include[{bigb_title_to_id(ns_to_txt(t[0], args.remove_namespace) + t[1])}]\n',
                            (list(reversed(cat_includes)) + list(reversed(page_includes))),
                        )) + '\n'
                    )
                if parent_title_file is not None:
                    cur_bigb_f.close()
    if out_bigb:
        if args.depth_per_file is None:
            out_bigb_f.close()
    if out_txt:
        out_txt_f.close()
    if out_dot:
        out_dot_f.write('}\n')
        out_dot_f.close()
    if out_html:
        # Close the list levels still open, then the document itself.
        out_html_f.write('</ul>\n' * last_depth)
        out_html_f.write('''</ul>
</body>
</html>
''')
        out_html_f.close()
if out_bigb:
    with open(os.path.join(outdir, 'ids_repeated.tmp'), 'w') as f:
        f.write('\n'.join(bigb_ids_repeated))
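# Sample of the .txt output format written above (illustrative, not from a
# real run): depth spaces, then the depth, then the page path:
#
#   0 Category:Mathematics
#    1 Category:Algebra
#     2 Linear_algebra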