more sophisticated site-parsing
This commit is contained in:
parent
aa22ced399
commit
73c7a7a052
22
wp2git.py
22
wp2git.py
|
@ -5,6 +5,7 @@ from sys import stderr, stdout
|
||||||
import argparse
|
import argparse
|
||||||
import mwclient
|
import mwclient
|
||||||
import subprocess as sp
|
import subprocess as sp
|
||||||
|
import urlparse
|
||||||
import os, locale, time
|
import os, locale, time
|
||||||
|
|
||||||
lang = locale.getdefaultlocale()[0].split('_')[0] or ''
|
lang = locale.getdefaultlocale()[0].split('_')[0] or ''
|
||||||
|
@ -23,7 +24,7 @@ def parse_args():
|
||||||
p.add_argument('-o','--outdir', help='Output directory')
|
p.add_argument('-o','--outdir', help='Output directory')
|
||||||
g=p.add_mutually_exclusive_group()
|
g=p.add_mutually_exclusive_group()
|
||||||
g.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)')
|
g.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)')
|
||||||
g.add_argument('--site', help='Alternate site (e.g. commons.wikimedia.org)')
|
g.add_argument('--site', help='Alternate site (e.g. http://commons.wikimedia.org[/w/])')
|
||||||
return p, p.parse_args()
|
return p, p.parse_args()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -31,14 +32,17 @@ def main():
|
||||||
|
|
||||||
# Connect to site with mwclient
|
# Connect to site with mwclient
|
||||||
if args.site is not None:
|
if args.site is not None:
|
||||||
s = args.site
|
scheme, host, path = urlparse.urlparse(args.site, scheme='http')[:3]
|
||||||
|
if path=='':
|
||||||
|
path = '/w/'
|
||||||
|
elif not path.endswith('/'):
|
||||||
|
path += '/'
|
||||||
elif args.lang is not None:
|
elif args.lang is not None:
|
||||||
s = '%s.wikipedia.org' % args.lang
|
scheme, host, path = 'http', '%s.wikipedia.org' % args.lang, '/w/'
|
||||||
else:
|
else:
|
||||||
s = 'wikipedia.org'
|
scheme, host, path = 'http', 'wikipedia.org', '/w/'
|
||||||
|
site = mwclient.Site((scheme, host), path=path)
|
||||||
site = mwclient.Site(s)
|
print('Connected to %s://%s%s' % (scheme, host, path), file=stderr)
|
||||||
print('Connected to site %s.' % s, file=stderr)
|
|
||||||
|
|
||||||
# Find the page
|
# Find the page
|
||||||
page = site.pages[args.article_name]
|
page = site.pages[args.article_name]
|
||||||
|
@ -63,11 +67,11 @@ def main():
|
||||||
for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'):
|
for rev in page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|content'):
|
||||||
id = rev['revid']
|
id = rev['revid']
|
||||||
text = rev.get('*','').encode('utf8')
|
text = rev.get('*','').encode('utf8')
|
||||||
committer = '%s@%s' % (rev['user'].encode('utf8'), site.host)
|
committer = '%s@%s' % (rev['user'].encode('utf8'), site.host[1])
|
||||||
ts = time.mktime(rev['timestamp'])
|
ts = time.mktime(rev['timestamp'])
|
||||||
print(" >> Revision %d by %s at %s: %s" % (id, rev['user'], rev['comment'], time.ctime(ts)), file=stderr)
|
print(" >> Revision %d by %s at %s: %s" % (id, rev['user'], rev['comment'], time.ctime(ts)), file=stderr)
|
||||||
|
|
||||||
summary = '%s\n\nURL: http://%s%sindex.php?oldid=%d' % (rev['comment'].encode('utf8') or '<blank>', site.host, site.path, id)
|
summary = '%s\n\nURL: %s://%s%sindex.php?oldid=%d' % (rev['comment'].encode('utf8') or '<blank>', site.host[0], site.host[1], site.path, id)
|
||||||
|
|
||||||
fid.write('commit refs/heads/master\n')
|
fid.write('commit refs/heads/master\n')
|
||||||
fid.write('committer <%s> %d +0000\n' % (committer, ts))
|
fid.write('committer <%s> %d +0000\n' % (committer, ts))
|
||||||
|
|
Loading…
Reference in New Issue