Wikipedia-Compare

From XPUB & Lens-Based wiki

A script that compares successive pages in a Wikipedia article's revision history. For each edit it prints what changed relative to the previous revision: the text that was added and the text that was deleted. The script can walk through roughly 500 successive edits starting from a revision of your choice; choose the starting point by adjusting the rvstartid revision id in the API URL.
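For reference, the revisions query used in the script below returns JSON of roughly this shape (a trimmed, illustrative sketch: the parentid and timestamp values are invented and only the field names and nesting matter, since these are what the script reads):

<source lang="python">
# Illustrative sketch of the API response the script walks over.
response = {
    "query": {
        "pages": {
            "34071": {
                "pageid": 34071,
                "title": "Whitney Houston",
                "revisions": [
                    {"revid": 476544716, "parentid": 476542000,
                     "timestamp": "2012-02-12T00:26:00Z"},
                    # ... one entry per revision, up to rvlimit entries ...
                ],
            },
        },
    },
}

# Each revision record carries the ids needed to build a diff URL.
revisions = [(r["timestamp"], r["parentid"], r["revid"])
             for r in response["query"]["pages"]["34071"]["revisions"]]
</source>

The script itself: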


<source lang="python">
#!/usr/bin/python
import sys, json, urllib2, os
import re
from pprint import pprint
import html5lib, lxml, lxml.cssselect

def getCSS (url, selector):
    # Fetch url (with a browser-like User-Agent) and return the list of
    # elements matching the given CSS selector.
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)
    page = htmlparser.parse(f)
    selector = lxml.cssselect.CSSSelector(selector)
    return list(selector(page))
# Ask the MediaWiki API for the revision history of Whitney_Houston
# (revid, parentid, timestamp), starting from the revision given as rvstartid.
f = urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=Whitney_Houston&rvstartid=476544716&prop=revisions&rvlimit=600&format=json")
data = json.load(f)
def join (spanlist):
    # Glue together diffchange spans that belong to the same edit: a span whose
    # tail is empty (or nearly so) runs on into the next span, anything longer
    # starts a new hit.
    hits = []
    current = ""
    for n in spanlist:
        if n.text:
            current += n.text
        # is the next span not connected to this one?
        if n.tail != None and len(n.tail.strip()) > 1:
            hits.append(current)
            current = ""
        else:
            if n.tail:
                current += n.tail
                # print lxml.etree.tostring(n)
    # keep whatever is left over after the last span
    if current:
        hits.append(current)
    return hits
# 34071 is the page id under which the API returns the Whitney_Houston revisions.
for r in data['query']['pages']["34071"]["revisions"]:
    revid = r["revid"]
    parentid = r["parentid"]
    timestamp = r["timestamp"]

    # The regular diff page between this revision and its parent.
    url = "http://en.wikipedia.org/w/index.php?title=Whitney_Houston&diff="+str(revid)+"&oldid="+str(parentid)
    print "----------------------------"
    print timestamp
    print parentid
    print revid
    #print "**************************"
    #print lxml.etree.tostring(getCSS(url, "span.diffchange-inline")[1])
    # diff-deletedline
    # diff-addedline
    spanlist = getCSS(url, ".diff-addedline span.diffchange-inline")
    print "added", join(spanlist)
    spanlist = getCSS(url, ".diff-deletedline span.diffchange-inline")
    print "deleted", join(spanlist)
