diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 260e111..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "ae"] - path = ae - url = git://github.com/CyberShadow/ae.git diff --git a/AUTHORS b/AUTHORS index 75c75d6..1f61fc2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,5 @@ This project contains code written by: +Daniel Lenski Robin Green diff --git a/LICENSE b/LICENSE index ea19472..4810b5c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2010 Vladimir Panteleev and contributors (see the file AUTHORS +Copyright (c) 2010 Vladimir Panteleev, Daniel Lenski and contributors (see the file AUTHORS for a complete list) This software is provided 'as-is', without any express or implied diff --git a/README.md b/README.md index 066ad43..f2ffcd6 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,22 @@ wp2git ====== -This program allows you to download and convert any Wikipedia article's history to a git repository, for easy browsing and blaming. +This program allows you to download and convert any Wikipedia article's history to a `git` repository, for easy browsing and blaming. ### Usage - $ wp2git Article_name_here + $ wp2git.py article_name -`wp2git` will create a directory, in which a new git repository will be created. +`wp2git` will create a directory, in which a new bare `git` repository will be created. Run `wp2git --help` for more options. ### Requirements -The commands `git` and `curl` should be accessible from `PATH`. +`git` should be accessible from `PATH`. -### Download +The [`mwclient` package](http://github.com/mwclient/mwclient) must be installed (use `pip install mwclient`). -You can find compiled Windows binaries on [files.thecybershadow.net](http://files.thecybershadow.net/wp2git/). +### Entirely based on -### Building - -You will need a [D compiler](http://dlang.org/download.html) to build `wp2git`. 
- - $ git clone --recursive https://github.com/CyberShadow/wp2git - $ rdmd --build-only wp2git +[CyberShadow's version](http://github.com/CyberShadow/wp2git) written in the D language. diff --git a/ae b/ae deleted file mode 160000 index 1962478..0000000 --- a/ae +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 196247891e84b6b30b54089d4a64163fad6807c3 diff --git a/wp2git.d b/wp2git.d deleted file mode 100644 index 55d85b3..0000000 --- a/wp2git.d +++ /dev/null @@ -1,111 +0,0 @@ -// Written in the D Programming Language (version 2) - -import std.stdio; -import std.process; -import std.stream; -import std.string; -import std.file; -import std.conv; -import std.uri; -import std.getopt; -import std.exception; - -import ae.utils.xmllite; - -int main(string[] args) -{ - string language="en"; - bool usage, noImport, keepHistory; - getopt(args, - "h|help", &usage, - "keep-history", &keepHistory, - "no-import", &noImport, - "language", &language, - ); - - enforce(args.length<=2, "Multiple article name arguments"); - if (args.length == 1 || usage) - { - stderr.writefln("Usage: %s Article_name [OPTION]...", args[0]); - stderr.writefln("Create a git repository with the history of the specified Wikipedia article."); - stderr.writefln("Supported options:"); - stderr.writefln(" -h --help Display this help"); - stderr.writefln(" --keep-history Don't delete history.xml"); - stderr.writefln(" --no-import Don't invoke ``git fast-import'' and only"); - stderr.writefln(" generate the fast-import data"); - stderr.writefln(" --language LANG Specify the Wikipedia language subdomain (default: en)"); - return 2; - } - - enforce(args.length==2, "No article specified"); - auto name = args[1]; - auto fn = sanitizeFn(name); - - if (!exists(fn)) - mkdir(fn); - chdir(fn); - - if (!exists("history.xml")) - enforce(spawnvp(P_WAIT, "curl", ["curl", "-d", "\"\"", "http://" ~ language ~ ".wikipedia.org/w/index.php?title=Special:Export&pages=" ~ encodeComponent(name), "-o", "history.xml"])==0, "curl 
error"); - - stderr.writefln("Loading history..."); - string xmldata = cast(string) read("history.xml"); - if (!keepHistory) std.file.remove("history.xml"); - auto xml = new XmlDocument(xmldata); - - string data = "reset refs/heads/master\n"; - auto page = xml[0]["page"]; - enforce(page, "No such page"); - foreach (child; page) - if (child.tag=="revision") - { - string id = child["id"].text; - string summary = child.findChild("comment") ? child["comment"].text : null; - string committer = child["contributor"].findChild("username") ? child["contributor"]["username"].text : child["contributor"]["ip"].text; - string text = child["text"].text; - stderr.writefln(" >> Revision %s by %s: %s", id, committer, summary); - - summary ~= "\n\nhttp://" ~ language ~ ".wikipedia.org/w/index.php?oldid=" ~ id; - data ~= - "commit refs/heads/master\n" ~ - "committer " ~ committer ~ " <" ~ committer ~ "@" ~ language ~ ".wikipedia.org> " ~ ISO8601toRFC2822(child["timestamp"].text) ~ "\n" ~ - "data " ~ to!string(summary.length) ~ "\n" ~ - summary ~ "\n" ~ - "M 644 inline " ~ fn ~ ".mw\n" ~ - "data " ~ to!string(text.length) ~ "\n" ~ - text ~ "\n" ~ - "\n"; - } - std.file.write("fast-import-data", data); - - if (noImport) - return 0; - - enforce(!exists(".git"), "A git repository already exists here!"); - - system("git init"); - system("git fast-import --date-format=rfc2822 < fast-import-data"); - std.file.remove("fast-import-data"); - system("git reset --hard"); - - return 0; -} - -string ISO8601toRFC2822(string s) -{ - const monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]; - - // 2010-06-15T19:28:44Z - // Feb 6 11:22:18 2007 -0500 - return monthNames[.to!int(s[5..7])-1] ~ " " ~ s[8..10] ~ " " ~ s[11..13] ~ ":" ~ s[14..16] ~ ":" ~ s[17..19] ~ " " ~ s[0..4] ~ " +0000"; -} - -string sanitizeFn(string s) -{ - static const string forbidden = `?*<>|:\/"`; - auto copy = s.dup; - foreach (ref c; copy) - if (forbidden.indexOf(c) >= 0) - c = '_'; 
- return assumeUnique(copy); -}
diff --git a/wp2git.py b/wp2git.py
new file mode 100755
index 0000000..e269045
--- /dev/null
+++ b/wp2git.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python2
+"""Convert the revision history of a MediaWiki article into a git repository.
+
+Every revision becomes one commit on refs/heads/master. The commits are
+written as a `git fast-import` stream which, unless --no-import is given,
+is imported into a new bare repository.
+"""
+from __future__ import print_function
+
+from sys import stderr, stdout
+import argparse
+import mwclient
+import subprocess as sp
+import os, locale, time, calendar
+
+# Language code of the current locale (e.g. 'en'). getdefaultlocale() returns
+# (None, None) when no locale is configured, so fall back to '' instead of
+# crashing at import time.
+lang = (locale.getdefaultlocale()[0] or '').split('_')[0]
+
+def sanitize(s):
+    """Return s with every character from the forbidden set replaced by '_'."""
+    forbidden = r'?*<>|:\/"'
+    for c in forbidden:
+        s = s.replace(c, '_')
+    return s
+
+def parse_args():
+    """Parse the command line; return (parser, namespace) so the caller can use parser.error()."""
+    p = argparse.ArgumentParser(description='Create a git repository with the history of the specified Wikipedia article.')
+    p.add_argument('article_name')
+    p.add_argument('--no-import', dest='doimport', default=True, action='store_false',
+                   help="Don't invoke git fast-import and only generate the fast-import data")
+    p.add_argument('-o','--outdir', help='Output directory')
+    g=p.add_mutually_exclusive_group()
+    g.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)')
+    g.add_argument('--site', help='Alternate site (e.g. commons.wikimedia.org)')
+    return p, p.parse_args()
+
+def main():
+    """Fetch every revision of the requested article and write it out as git commits."""
+    p, args = parse_args()
+
+    # Connect to site with mwclient
+    if args.site is not None:
+        s = args.site
+    elif args.lang:
+        # Test truthiness: an empty language code must not yield '.wikipedia.org'
+        s = '%s.wikipedia.org' % args.lang
+    else:
+        s = 'wikipedia.org'
+
+    site = mwclient.Site(s)
+    print('Connected to site %s.' % s, file=stderr)
+
+    # Find the page
+    page = site.pages[args.article_name]
+    if not page.exists:
+        # Report the missing article, not the site it was looked up on
+        p.error('Page %s does not exist' % args.article_name)
+
+    # Create output directory
+    fn = sanitize(args.article_name)
+    if args.outdir is not None:
+        path = args.outdir
+    else:
+        path = fn
+
+    if os.path.exists(path):
+        p.error('Path %s exists' % path)
+    os.mkdir(path)
+    os.chdir(path)
+
+    # Create fast-import data stream
+    with open('fast-import-data', 'w+b') as fid:
+        fid.write('reset refs/heads/master\n')
+        for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'):
+            id = rev['revid']
+            # Default user and comment the same way as the text; presumably the
+            # API omits these fields for hidden revisions -- TODO confirm.
+            text = rev.get('*','').encode('utf8')
+            user = rev.get('user','').encode('utf8')
+            comment = rev.get('comment','').encode('utf8')
+            committer = '%s@%s' % (user, site.host)
+            # The stream declares a +0000 offset, so the timestamp must be read
+            # as UTC: timegm() does that, mktime() would apply the local zone.
+            ts = calendar.timegm(rev['timestamp'])
+            print(" >> Revision %d by %s at %s: %s" % (id, user, time.ctime(ts), comment), file=stderr)
+
+            summary = '%s\n\nURL: http://%s%sindex.php?oldid=%d' % (comment, site.host, site.path, id)
+
+            fid.write('commit refs/heads/master\n')
+            fid.write('committer <%s> %d +0000\n' % (committer, ts))
+            fid.write('data %d\n%s\n' % (len(summary), summary))
+            fid.write('M 644 inline %s.mw\n' % fn)
+            fid.write('data %d\n%s\n' % (len(text), text))
+        fid.write('done\n')
+
+        if args.doimport:
+            sp.check_call(['git','init','--bare'])
+            fid.seek(0, 0)
+            sp.check_call(['git', 'fast-import','--quiet'], stdin=fid)
+
+    if args.doimport:
+        os.unlink('fast-import-data')
+
+if __name__=='__main__':
+    main()