more sophisticated site-parsing

This commit is contained in:
Daniel Lenski 2014-10-14 12:24:41 -07:00
parent aa22ced399
commit 73c7a7a052
1 changed file with 13 additions and 9 deletions

View File

@ -5,6 +5,7 @@ from sys import stderr, stdout
import argparse import argparse
import mwclient import mwclient
import subprocess as sp import subprocess as sp
import urlparse
import os, locale, time import os, locale, time
lang = locale.getdefaultlocale()[0].split('_')[0] or '' lang = locale.getdefaultlocale()[0].split('_')[0] or ''
@ -23,7 +24,7 @@ def parse_args():
p.add_argument('-o','--outdir', help='Output directory') p.add_argument('-o','--outdir', help='Output directory')
g=p.add_mutually_exclusive_group() g=p.add_mutually_exclusive_group()
g.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)') g.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)')
g.add_argument('--site', help='Alternate site (e.g. commons.wikimedia.org)') g.add_argument('--site', help='Alternate site (e.g. http://commons.wikimedia.org[/w/])')
return p, p.parse_args() return p, p.parse_args()
def main(): def main():
@ -31,14 +32,17 @@ def main():
# Connect to site with mwclient # Connect to site with mwclient
if args.site is not None: if args.site is not None:
s = args.site scheme, host, path = urlparse.urlparse(args.site, scheme='http')[:3]
if path=='':
path = '/w/'
elif not path.endswith('/'):
path += '/'
elif args.lang is not None: elif args.lang is not None:
s = '%s.wikipedia.org' % args.lang scheme, host, path = 'http', '%s.wikipedia.org' % args.lang, '/w/'
else: else:
s = 'wikipedia.org' scheme, host, path = 'http', 'wikipedia.org', '/w/'
site = mwclient.Site((scheme, host), path=path)
site = mwclient.Site(s) print('Connected to %s://%s%s' % (scheme, host, path), file=stderr)
print('Connected to site %s.' % s, file=stderr)
# Find the page # Find the page
page = site.pages[args.article_name] page = site.pages[args.article_name]
@ -63,11 +67,11 @@ def main():
for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'): for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'):
id = rev['revid'] id = rev['revid']
text = rev.get('*','').encode('utf8') text = rev.get('*','').encode('utf8')
committer = '%s@%s' % (rev['user'].encode('utf8'), site.host) committer = '%s@%s' % (rev['user'].encode('utf8'), site.host[1])
ts = time.mktime(rev['timestamp']) ts = time.mktime(rev['timestamp'])
print(" >> Revision %d by %s at %s: %s" % (id, rev['user'], rev['comment'], time.ctime(ts)), file=stderr) print(" >> Revision %d by %s at %s: %s" % (id, rev['user'], rev['comment'], time.ctime(ts)), file=stderr)
summary = '%s\n\nURL: http://%s%sindex.php?oldid=%d' % (rev['comment'].encode('utf8') or '<blank>', site.host, site.path, id) summary = '%s\n\nURL: %s://%s%sindex.php?oldid=%d' % (rev['comment'].encode('utf8') or '<blank>', site.host[0], site.host[1], site.path, id)
fid.write('commit refs/heads/master\n') fid.write('commit refs/heads/master\n')
fid.write('committer <%s> %d +0000\n' % (committer, ts)) fid.write('committer <%s> %d +0000\n' % (committer, ts))