Wikipedia-Compare: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
Line 32: Line 32:
     for n in spanlist:
     for n in spanlist:
         current += n.text
         current += n.text
#        is next not connected?:
 
         if n.tail != None and len(n.tail.strip())>1:
         if n.tail != None and len(n.tail.strip())>1:
             hits.append(current)
             hits.append(current)
Line 40: Line 40:
                 current += n.tail
                 current += n.tail


#                print lxml.etree.tostring(n)s
     return hits
     return hits


Line 54: Line 53:
     print revid
     print revid


    #print "**************************"
    #print lxml.etree.tostring(getCSS(url, "span.diffchange-inline")[1])
    # diff-deletedline
    # diff-addedline


     spanlist = getCSS(url, ".diff-addedline span.diffchange-inline")
     spanlist = getCSS(url, ".diff-addedline span.diffchange-inline")

Revision as of 13:10, 4 April 2012

A script to compare successive Wikipedia history pages. It shows the content that was edited relative to the previous page, and the edit itself. The script can walk the 500 edits following a chosen starting point; pick the starting date by adjusting the revision ID in the query URL.

<source lang="python">


#!/usr/bin/python

import json
import os
import re
import sys
import urllib2
from pprint import pprint

import html5lib
import lxml
import lxml.cssselect

def getCSS (url, selector):
    """Fetch *url*, parse it as HTML, and return the list of elements
    matching the CSS *selector* string.

    url      -- page to download (str)
    selector -- CSS selector, e.g. ".diff-addedline span.diffchange-inline"
    Returns a list of lxml elements (may be empty).
    Raises urllib2.URLError / HTTPError on network failure.
    """
    # html5lib with the lxml tree builder gives lenient, browser-like
    # parsing; namespaceHTMLElements=False keeps plain tag names so CSS
    # selectors work without namespace prefixes.
    htmlparser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    request = urllib2.Request(url)
    # Wikipedia throttles/refuses the default urllib2 user agent,
    # so masquerade as a desktop browser.
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(f)
    finally:
        # BUG FIX: the response object was never closed; this function runs
        # twice per revision in a long loop, leaking one socket per call.
        f.close()
    # Use a distinct name for the compiled selector instead of shadowing
    # the `selector` argument.
    compiled = lxml.cssselect.CSSSelector(selector)
    return list(compiled(page))


# Ask the MediaWiki API for up to 600 revisions of the "Whitney Houston"
# article, starting at revision id 476544716, as JSON.  Change rvstartid
# to start the walk at a different date.
f=urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=Whitney_Houston&rvstartid=476544716&prop=revisions&rvlimit=600&format=json")

# Decoded API response; revisions live under data['query']['pages'][<pageid>].
data = json.load(f)

def join (spanlist):
    """Concatenate the text of consecutive diff <span> elements into
    per-edit strings.

    spanlist -- lxml elements (each has .text and .tail).
    Returns a list of strings: spans whose tail is "substantial"
    (more than one non-whitespace character) end the current group;
    shorter tails are treated as glue inside a single edit.
    """
    hits = []
    current = ""
    for n in spanlist:
        # BUG FIX: lxml sets .text to None for empty elements; the
        # unconditional `current += n.text` raised TypeError on those.
        if n.text:
            current += n.text
        if n.tail != None and len(n.tail.strip()) > 1:
            # A substantial tail separates two distinct edits.
            hits.append(current)
            current = ""
        elif n.tail:
            # Short/whitespace tail: keep accumulating the same edit.
            current += n.tail
    # NOTE(review): text accumulated after the last separating tail is
    # dropped, as in the original — presumably intentional; confirm.
    return hits

# Walk every fetched revision of the article ("34071" is the page id of
# "Whitney Houston") and, for each one, scrape the rendered diff page to
# print what was added and what was deleted.
for r in data['query']['pages']["34071"]["revisions"]:

   revid = r["revid"]        # id of this revision
   parentid = r["parentid"]  # id of the revision it was diffed against
   timestamp = r["timestamp"]
   
   # Rendered diff between the parent revision and this one.
   url = "http://en.wikipedia.org/w/index.php?title=Whitney_Houston&diff="+str(revid)+"&oldid="+ str(parentid)
   print "----------------------------"
   print timestamp
   print parentid
   print revid


   # Inline change markers inside added lines → text that was inserted.
   spanlist = getCSS(url, ".diff-addedline span.diffchange-inline")
   print "added", join(spanlist)
   # Inline change markers inside deleted lines → text that was removed.
   spanlist = getCSS(url, ".diff-deletedline span.diffchange-inline")
   print "deleted", join(spanlist)