replace D version with Python version, based on mwclient
This commit is contained in:
parent
2fa911799e
commit
aa22ced399
|
@ -1,3 +0,0 @@
|
||||||
[submodule "ae"]
|
|
||||||
path = ae
|
|
||||||
url = git://github.com/CyberShadow/ae.git
|
|
1
AUTHORS
1
AUTHORS
|
@ -1,4 +1,5 @@
|
||||||
This project contains code written by:
|
This project contains code written by:
|
||||||
|
|
||||||
|
Daniel Lenski <dlenski@gmail.com>
|
||||||
Vladimir Panteleev <vladimir@thecybershadow.net>
|
Vladimir Panteleev <vladimir@thecybershadow.net>
|
||||||
Robin Green <greenrd@greenrd.org>
|
Robin Green <greenrd@greenrd.org>
|
||||||
|
|
2
LICENSE
2
LICENSE
|
@ -1,4 +1,4 @@
|
||||||
Copyright (c) 2010 Vladimir Panteleev and contributors (see the file AUTHORS
|
Copyright (c) 2010 Daniel Lenski and contributors (see the file AUTHORS
|
||||||
for a complete list)
|
for a complete list)
|
||||||
|
|
||||||
This software is provided 'as-is', without any express or implied
|
This software is provided 'as-is', without any express or implied
|
||||||
|
|
19
README.md
19
README.md
|
@ -1,27 +1,22 @@
|
||||||
wp2git
|
wp2git
|
||||||
======
|
======
|
||||||
|
|
||||||
This program allows you to download and convert any Wikipedia article's history to a git repository, for easy browsing and blaming.
|
This program allows you to download and convert any Wikipedia article's history to a `git` repository, for easy browsing and blaming.
|
||||||
|
|
||||||
### Usage
|
### Usage
|
||||||
|
|
||||||
$ wp2git Article_name_here
|
$ wp2git.py article_name
|
||||||
|
|
||||||
`wp2git` will create a directory, in which a new git repository will be created.
|
`wp2git` will create a directory, in which a new bare `git` repository will be created.
|
||||||
|
|
||||||
Run `wp2git --help` for more options.
|
Run `wp2git --help` for more options.
|
||||||
|
|
||||||
### Requirements
|
### Requirements
|
||||||
|
|
||||||
The commands `git` and `curl` should be accessible from `PATH`.
|
`git` should be accessible from `PATH`.
|
||||||
|
|
||||||
### Download
|
The [`mwclient` package](http://github.com/mwclient/mwclient) must be installed (use `pip install mwclient`).
|
||||||
|
|
||||||
You can find compiled Windows binaries on [files.thecybershadow.net](http://files.thecybershadow.net/wp2git/).
|
### Entirely based on
|
||||||
|
|
||||||
### Building
|
[CyberShadow's version](http://github.com/CyberShadow/wp2git) written in the D language.
|
||||||
|
|
||||||
You will need a [D compiler](http://dlang.org/download.html) to build `wp2git`.
|
|
||||||
|
|
||||||
$ git clone --recursive https://github.com/CyberShadow/wp2git
|
|
||||||
$ rdmd --build-only wp2git
|
|
||||||
|
|
1
ae
1
ae
|
@ -1 +0,0 @@
|
||||||
Subproject commit 196247891e84b6b30b54089d4a64163fad6807c3
|
|
111
wp2git.d
111
wp2git.d
|
@ -1,111 +0,0 @@
|
||||||
// Written in the D Programming Language (version 2)
|
|
||||||
|
|
||||||
import std.stdio;
|
|
||||||
import std.process;
|
|
||||||
import std.stream;
|
|
||||||
import std.string;
|
|
||||||
import std.file;
|
|
||||||
import std.conv;
|
|
||||||
import std.uri;
|
|
||||||
import std.getopt;
|
|
||||||
import std.exception;
|
|
||||||
|
|
||||||
import ae.utils.xmllite;
|
|
||||||
|
|
||||||
/// Program entry point: download the full history of one Wikipedia article
/// and turn it into a git repository.
///
/// Steps: parse options, create/enter a directory named after the article,
/// fetch `history.xml` through Special:Export with `curl` (unless the file is
/// already there), convert every <revision> into a `git fast-import` commit,
/// write the stream to `fast-import-data`, and (unless --no-import) run
/// `git init`, `git fast-import` and `git reset --hard`.
///
/// Returns: 0 on success, 2 after printing the usage text.
/// Throws: Exception (via enforce) on bad arguments, a failing `curl`,
/// a missing page, a pre-existing `.git`, or a failing git command.
int main(string[] args)
{
    string language="en";
    bool usage, noImport, keepHistory;
    getopt(args,
        "h|help", &usage,
        "keep-history", &keepHistory,
        "no-import", &noImport,
        "language", &language,
    );

    enforce(args.length<=2, "Multiple article name arguments");
    if (args.length == 1 || usage)
    {
        stderr.writefln("Usage: %s Article_name [OPTION]...", args[0]);
        stderr.writefln("Create a git repository with the history of the specified Wikipedia article.");
        stderr.writefln("Supported options:");
        stderr.writefln(" -h --help Display this help");
        stderr.writefln(" --keep-history Don't delete history.xml");
        stderr.writefln(" --no-import Don't invoke ``git fast-import'' and only");
        stderr.writefln(" generate the fast-import data");
        stderr.writefln(" --language LANG Specify the Wikipedia language subdomain (default: en)");
        return 2;
    }

    enforce(args.length==2, "No article specified");
    auto name = args[1];
    // The sanitized article name doubles as directory name and as the
    // base name of the tracked file inside the repository.
    auto fn = sanitizeFn(name);

    if (!exists(fn))
        mkdir(fn);
    chdir(fn);

    // Re-use a history.xml left behind by an earlier --keep-history run.
    if (!exists("history.xml"))
        enforce(spawnvp(P_WAIT, "curl", ["curl", "-d", "\"\"", "http://" ~ language ~ ".wikipedia.org/w/index.php?title=Special:Export&pages=" ~ encodeComponent(name), "-o", "history.xml"])==0, "curl error");

    stderr.writefln("Loading history...");
    string xmldata = cast(string) read("history.xml");
    if (!keepHistory) std.file.remove("history.xml");
    auto xml = new XmlDocument(xmldata);

    string data = "reset refs/heads/master\n";
    auto page = xml[0]["page"];
    enforce(page, "No such page");
    foreach (child; page)
        if (child.tag=="revision")
        {
            string id = child["id"].text;
            string summary = child.findChild("comment") ? child["comment"].text : null;
            // Anonymous edits carry an <ip> element instead of <username>.
            string committer = child["contributor"].findChild("username") ? child["contributor"]["username"].text : child["contributor"]["ip"].text;
            string text = child["text"].text;
            stderr.writefln(" >> Revision %s by %s: %s", id, committer, summary);

            summary ~= "\n\nhttp://" ~ language ~ ".wikipedia.org/w/index.php?oldid=" ~ id;
            // .length of a D string is its byte count, which is what the
            // fast-import "data <n>" header is given here.
            data ~=
                "commit refs/heads/master\n" ~
                "committer " ~ committer ~ " <" ~ committer ~ "@" ~ language ~ ".wikipedia.org> " ~ ISO8601toRFC2822(child["timestamp"].text) ~ "\n" ~
                "data " ~ to!string(summary.length) ~ "\n" ~
                summary ~ "\n" ~
                "M 644 inline " ~ fn ~ ".mw\n" ~
                "data " ~ to!string(text.length) ~ "\n" ~
                text ~ "\n" ~
                "\n";
        }
    std.file.write("fast-import-data", data);

    if (noImport)
        return 0;

    enforce(!exists(".git"), "A git repository already exists here!");

    // Stop at the first failing git command instead of silently continuing;
    // in particular fast-import-data is kept when the import itself failed.
    enforce(system("git init") == 0, "git init failed");
    enforce(system("git fast-import --date-format=rfc2822 < fast-import-data") == 0, "git fast-import failed");
    std.file.remove("fast-import-data");
    enforce(system("git reset --hard") == 0, "git reset failed");

    return 0;
}
|
|
||||||
|
|
||||||
/// Re-arrange an ISO 8601 UTC timestamp into the date layout this program
/// feeds to `git fast-import --date-format=rfc2822`.
///
/// The input is sliced at fixed offsets, so it must look like
/// "2010-06-15T19:28:44Z"; the result looks like
/// "Jun 15 19:28:44 2010 +0000" (the offset is always +0000).
string ISO8601toRFC2822(string s)
{
    const monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"];

    // Field positions: YYYY-MM-DDThh:mm:ssZ
    //                  0123456789012345678
    auto month   = monthNames[.to!int(s[5..7])-1];
    auto day     = s[8..10];
    auto hours   = s[11..13];
    auto minutes = s[14..16];
    auto seconds = s[17..19];
    auto year    = s[0..4];

    return month ~ " " ~ day ~ " " ~ hours ~ ":" ~ minutes ~ ":" ~ seconds ~ " " ~ year ~ " +0000";
}
|
|
||||||
|
|
||||||
/// Return a copy of `s` in which each of the characters ?*<>|:\/" is
/// replaced by an underscore, so the result can be used as a file or
/// directory name. The input string itself is left untouched.
string sanitizeFn(string s)
{
    static const string forbidden = `?*<>|:\/"`;
    char[] result = s.dup;
    for (size_t i = 0; i < result.length; i++)
    {
        // indexOf yields -1 when the character is not in the forbidden set.
        if (forbidden.indexOf(result[i]) != -1)
            result[i] = '_';
    }
    // `result` is a private copy nobody else references, so it is safe to
    // hand it out as an immutable string.
    return assumeUnique(result);
}
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from sys import stderr, stdout
|
||||||
|
import argparse
|
||||||
|
import mwclient
|
||||||
|
import subprocess as sp
|
||||||
|
import os, locale, time
|
||||||
|
|
||||||
|
# Default Wikipedia language code: the language part of the user's default
# locale ('en' for 'en_US'), or '' when no locale can be determined.
# getdefaultlocale() may return (None, None) (e.g. under the C/POSIX locale),
# so the None has to be replaced *before* calling .split().
# NOTE(review): some platforms are reported to raise ValueError for an
# unrecognised locale string -- treated here the same as "no locale".
try:
    _default_locale = locale.getdefaultlocale()[0]
except ValueError:
    _default_locale = None
lang = (_default_locale or '').split('_')[0]
|
||||||
|
|
||||||
|
def sanitize(s):
    """Return *s* with each of the characters ``?*<>|:\\/"`` replaced by ``_``.

    The result is used as a directory name and as the base name of the file
    tracked in the generated repository.
    """
    unsafe = r'?*<>|:\/"'
    return ''.join('_' if ch in unsafe else ch for ch in s)
|
||||||
|
|
||||||
|
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns a ``(parser, namespace)`` tuple; the parser is handed back so the
    caller can report later failures through ``parser.error``.
    ``--lang`` and ``--site`` are mutually exclusive.
    """
    parser = argparse.ArgumentParser(
        description='Create a git repository with the history of the specified Wikipedia article.')
    parser.add_argument('article_name')
    # store_false: args.doimport stays True unless --no-import is given.
    parser.add_argument(
        '--no-import',
        dest='doimport',
        default=True,
        action='store_false',
        help="Don't invoke git fast-import and only generate the fast-import data")
    parser.add_argument('-o', '--outdir', help='Output directory')

    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        '--lang',
        default=lang,
        help='Wikipedia language code (default %(default)s)')
    source.add_argument(
        '--site',
        help='Alternate site (e.g. commons.wikimedia.org)')

    namespace = parser.parse_args()
    return parser, namespace
|
||||||
|
|
||||||
|
def main():
    """Export a wiki page's revision history into a bare git repository.

    Parses the command line, connects to the selected MediaWiki site with
    mwclient, creates the output directory (refusing to reuse an existing
    path), writes one fast-import commit per revision, oldest first, to
    ``fast-import-data`` and, unless ``--no-import`` was given, feeds that
    file to ``git fast-import`` inside a freshly initialised bare repository
    and deletes it afterwards.

    Exits through ``ArgumentParser.error`` when the page does not exist or
    the output path already exists. ``subprocess.CalledProcessError``
    propagates when a git command fails; the data file is then left in place.
    """
    # Function-scope import: only this function needs it.
    import calendar

    p, args = parse_args()

    # Connect to site with mwclient
    if args.site is not None:
        s = args.site
    elif args.lang:
        # Truthiness, not "is not None": an empty language code would
        # otherwise produce the bogus host '.wikipedia.org'.
        s = '%s.wikipedia.org' % args.lang
    else:
        s = 'wikipedia.org'

    site = mwclient.Site(s)
    print('Connected to site %s.' % s, file=stderr)

    # Find the page
    page = site.pages[args.article_name]
    if not page.exists:
        p.error('Page %s does not exist' % args.article_name)

    # Create output directory
    fn = sanitize(args.article_name)
    if args.outdir is not None:
        path = args.outdir
    else:
        path = fn

    if os.path.exists(path):
        p.error('Path %s exists' % path)
    os.mkdir(path)
    os.chdir(path)

    # Create fast-import data stream
    with open('fast-import-data', 'w+b') as fid:
        fid.write('reset refs/heads/master\n')
        for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'):
            revid = rev['revid']
            # NOTE(review): the API appears to omit 'user'/'comment' for
            # revisions whose author or summary has been hidden -- fall back
            # to placeholders instead of dying with KeyError; confirm.
            user = rev.get('user') or u'unknown'
            comment = rev.get('comment') or u''
            text = rev.get('*', '').encode('utf8')
            committer = '%s@%s' % (user.encode('utf8'), site.host)
            # The revision timestamp is UTC and the committer line below
            # declares '+0000', so convert with timegm(); mktime() would
            # interpret the struct_time as local time and shift every commit
            # by the machine's UTC offset.
            ts = calendar.timegm(rev['timestamp'])
            print(" >> Revision %d by %s at %s: %s" % (revid, user, time.ctime(ts), comment), file=stderr)

            summary = '%s\n\nURL: http://%s%sindex.php?oldid=%d' % (comment.encode('utf8') or '<blank>', site.host, site.path, revid)

            # len() is taken on the UTF-8 encoded byte strings, so the
            # "data <n>" headers carry byte counts.
            fid.write('commit refs/heads/master\n')
            fid.write('committer <%s> %d +0000\n' % (committer, ts))
            fid.write('data %d\n%s\n' % (len(summary), summary))
            fid.write('M 644 inline %s.mw\n' % fn)
            fid.write('data %d\n%s\n' % (len(text), text))
        fid.write('done\n')

        if args.doimport:
            sp.check_call(['git', 'init', '--bare'])
            # git reads the underlying file descriptor directly: make sure
            # everything buffered is on disk, then rewind to the start.
            fid.flush()
            fid.seek(0, 0)
            sp.check_call(['git', 'fast-import', '--quiet'], stdin=fid)

    # Only reached when the import succeeded; with --no-import the data file
    # is the deliverable and is kept.
    if args.doimport:
        os.unlink('fast-import-data')
|
||||||
|
|
||||||
|
# Run the exporter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue