Wikipedia-Compare: Difference between revisions
Marie Wocher (talk | contribs) No edit summary |
Marie Wocher (talk | contribs) No edit summary |
||
(5 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
A script to compare successive Wikipedia history pages. It shows the content that has been edited from the previous page and the edit itself. | A script to compare successive Wikipedia history pages. It shows the content that has been edited from the previous page and the edit itself. The script can run through the 500 following edits, starting with the page of a date you can choose by adjusting the ID of the page. | ||
<source lang="python"> | <source lang="python"> | ||
#!/usr/bin/python | |||
import sys, json, urllib2, os | |||
import re | |||
from pprint import pprint | |||
import html5lib, lxml, lxml.cssselect | |||
import urllib2 | |||
def getCSS (url, selector): | |||
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False) | |||
request = urllib2.Request(url) | |||
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5") | |||
f=urllib2.urlopen(request) | |||
page = htmlparser.parse(f) | |||
selector = lxml.cssselect.CSSSelector(selector) | |||
return list(selector(page)) | |||
f=urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json") | |||
data = json.load(f) | |||
def join (spanlist): | |||
hits = [] | |||
current = "" | |||
for n in spanlist: | |||
current += n.text | |||
if n.tail != None and len(n.tail.strip())>1: | |||
hits.append(current) | |||
current = "" | |||
else: | |||
if n.tail: | |||
current += n.tail | |||
return hits | |||
for r in data['query']['pages']["PAGE-ID"]["revisions"]: | |||
revid = r["revid"] | |||
parentid = r["parentid"] | |||
timestamp = r["timestamp"] | |||
url = "http://en.wikipedia.org/w/index.php?title=Whitney_Houston&diff="+str(revid)+"&oldid="+ str(parentid) | |||
print "----------------------------" | |||
print timestamp | |||
print parentid | |||
print revid | |||
spanlist = getCSS(url, ".diff-addedline span.diffchange-inline") | |||
print "added", join(spanlist) | |||
spanlist = getCSS(url, ".diff-deletedline span.diffchange-inline") | |||
print "deleted", join(spanlist) |
Latest revision as of 12:12, 4 April 2012
A script to compare successive Wikipedia history pages. It shows the content that has been edited from the previous page and the edit itself. The script can run through the 500 following edits, starting with the page of a date you can choose by adjusting the ID of the page.
<source lang="python">
#!/usr/bin/python
import sys, json, urllib2, os import re from pprint import pprint import html5lib, lxml, lxml.cssselect import urllib2
def getCSS (url, selector):
    """Fetch *url* and return the elements matching a CSS selector.

    The page is downloaded with urllib2, parsed by html5lib into an lxml
    tree (HTML elements are not namespaced, per namespaceHTMLElements=False),
    and queried with lxml.cssselect.

    url      -- address of the HTML page to download.
    selector -- CSS selector string, e.g. ".diff-addedline span".

    Returns a list of the matching lxml elements (empty if nothing matches).
    Errors raised by urllib2 while opening the URL propagate to the caller.
    """
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    # Identify as a desktop browser instead of urllib2's default agent.
    # NOTE(review): presumably required for Wikipedia to serve the page -- confirm.
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(response)
    finally:
        # This function runs twice per revision; close the connection
        # explicitly instead of leaving it to the garbage collector.
        response.close()
    # Use a separate name so the caller's selector string is not shadowed.
    matcher = lxml.cssselect.CSSSelector(selector)
    return list(matcher(page))
# Ask the MediaWiki API for the revision list of one page: at most 500
# entries (rvlimit=500), beginning at the revision given by rvstartid.
# PAGE-TITLE and START-ID are placeholders that must be replaced with real
# values before the script is run.
f = urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json")
try:
    data = json.load(f)
finally:
    # Release the HTTP connection once the JSON has been read.
    f.close()
def join (spanlist):
    """Merge neighbouring changed spans into contiguous text fragments.

    Each element's text is appended to the fragment being built. The text
    that follows the element (its ``tail``) decides what happens next:

    * a tail with more than one non-whitespace character is treated as
      unchanged content separating two edits, so the current fragment is
      finished and a new one is started;
    * any other non-empty tail (whitespace, a single character) is kept as
      glue inside the current fragment.

    spanlist -- iterable of objects with ``text`` and ``tail`` attributes
                (e.g. lxml or ElementTree elements).

    Returns the list of fragments, in document order.
    """
    hits = []
    current = ""
    for n in spanlist:
        # An element without leading text reports text as None.
        current += n.text or ""
        tail = n.tail
        if tail is not None and len(tail.strip()) > 1:
            hits.append(current)
            current = ""
        elif tail:
            current += tail
    # The last span usually has no significant tail (the change sits at the
    # end of the line); without this flush that final fragment was lost.
    if current:
        hits.append(current)
    return hits
# For every revision returned by the API, open the Wikipedia diff between the
# revision and its parent and print the inline changes that were added and
# deleted. "PAGE-ID" is a placeholder for the numeric page id used as key in
# the API response.
# The single-argument print(...) form is used throughout; under Python 2 it
# writes exactly what the equivalent print statement would.
for r in data['query']['pages']["PAGE-ID"]["revisions"]:
    revid = r["revid"]
    parentid = r["parentid"]
    timestamp = r["timestamp"]
    url = "".join([
        "http://en.wikipedia.org/w/index.php?title=Whitney_Houston&diff=",
        str(revid),
        "&oldid=",
        str(parentid),
    ])
    for header_line in ("----------------------------", timestamp, parentid, revid):
        print(header_line)
    # Same page, two selectors: first the inserted text, then the removed text.
    for label, css in (
        ("added", ".diff-addedline span.diffchange-inline"),
        ("deleted", ".diff-deletedline span.diffchange-inline"),
    ):
        spanlist = getCSS(url, css)
        print("%s %s" % (label, join(spanlist)))