Wikipedia-Counter: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
 
Line 2: Line 2:


<source lang="python">
<source lang="python">
#!/usr/bin/python
import sys, json, urllib2, os
import re
from pprint import pprint
import html5lib, lxml, lxml.cssselect
import urllib2, datetime
def getCSS (url, selector):
    """Download *url* and return the elements matching a CSS selector.

    The page is requested with a browser-like User-Agent header, parsed
    with html5lib into an lxml tree (HTML elements are not namespaced),
    and queried with lxml's CSSSelector.

    url      -- address of the HTML page to download
    selector -- CSS selector expression, as a string

    Returns a list of the matching elements.
    """
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(f)
    finally:
        # Close the HTTP response even when parsing fails.
        f.close()
    # Separate name so the caller-supplied selector string is not overwritten.
    matcher = lxml.cssselect.CSSSelector(selector)
    return list(matcher(page))
# Ask the MediaWiki API for up to 500 revisions of one page, as JSON.
# PAGE-TITLE and START-ID here, and PAGE-ID below, are placeholders that
# have to be replaced by hand before the script is run.
f = urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json")
try:
    data = json.load(f)
finally:
    # Close the HTTP response even when the body is not valid JSON.
    f.close()

revisions = data['query']['pages']["PAGE-ID"]["revisions"]


def _revision_date(revision):
    """Return the calendar day of *revision* as a datetime.date.

    Only the leading YYYY-MM-DD part of the "timestamp" value is used;
    the time of day is ignored.
    """
    timestamp = revision["timestamp"]
    return datetime.date(int(timestamp[0:4]), int(timestamp[5:7]), int(timestamp[8:10]))


# List every revision (date parts, then revision id) and tally the edits
# per day in the same pass, so the per-day report needs no rescanning.
edits_per_day = {}
for r in revisions:
    timestamp = r["timestamp"]
    print("%s %s %s" % (timestamp[0:4], timestamp[5:7], timestamp[8:10]))
    print(r["revid"])
    print("**************************")
    dtt = _revision_date(r)
    edits_per_day[dtt] = edits_per_day.get(dtt, 0) + 1
print("==============================")

# Per-day report: one line with the date, one line with its edit count,
# stepping back one day at a time from the first revision's day to the
# last revision's day (days without edits are reported as 0).
if revisions:
    firstdate = _revision_date(revisions[0])
    # [-1] rather than [499]: fewer than 500 revisions may be returned.
    lastdate = _revision_date(revisions[-1])
    currentdate = firstdate
    # ">=" rather than "!=" so the loop also ends when the first revision
    # is older than the last one instead of counting down indefinitely.
    while currentdate >= lastdate:
        print(currentdate)
        print(edits_per_day.get(currentdate, 0))
        currentdate = currentdate - datetime.timedelta(days=1)

Latest revision as of 13:21, 4 April 2012

A script that counts how many edits were made on each day to one particular Wikipedia page. The script can only count 500 edits, starting from the revision of a date you choose by adjusting the ID of the page revision.

<source lang="python">

#!/usr/bin/python

import sys, json, urllib2, os
import re
from pprint import pprint
import html5lib, lxml, lxml.cssselect
import urllib2, datetime

def getCSS (url, selector):
    """Download *url* and return the elements matching a CSS selector.

    The page is requested with a browser-like User-Agent header, parsed
    with html5lib into an lxml tree (HTML elements are not namespaced),
    and queried with lxml's CSSSelector.

    url      -- address of the HTML page to download
    selector -- CSS selector expression, as a string

    Returns a list of the matching elements.
    """
    htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(f)
    finally:
        # Close the HTTP response even when parsing fails.
        f.close()
    # Separate name so the caller-supplied selector string is not overwritten.
    matcher = lxml.cssselect.CSSSelector(selector)
    return list(matcher(page))


# Ask the MediaWiki API for up to 500 revisions of one page, as JSON.
# PAGE-TITLE and START-ID here, and PAGE-ID below, are placeholders that
# have to be replaced by hand before the script is run.
f = urllib2.urlopen("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json")
try:
    data = json.load(f)
finally:
    # Close the HTTP response even when the body is not valid JSON.
    f.close()

revisions = data['query']['pages']["PAGE-ID"]["revisions"]


def _revision_date(revision):
    """Return the calendar day of *revision* as a datetime.date.

    Only the leading YYYY-MM-DD part of the "timestamp" value is used;
    the time of day is ignored.
    """
    timestamp = revision["timestamp"]
    return datetime.date(int(timestamp[0:4]), int(timestamp[5:7]), int(timestamp[8:10]))


# List every revision (date parts, then revision id) and tally the edits
# per day in the same pass, so the per-day report needs no rescanning.
edits_per_day = {}
for r in revisions:
    timestamp = r["timestamp"]
    print("%s %s %s" % (timestamp[0:4], timestamp[5:7], timestamp[8:10]))
    print(r["revid"])
    print("**************************")
    dtt = _revision_date(r)
    edits_per_day[dtt] = edits_per_day.get(dtt, 0) + 1
print("==============================")

# Per-day report: one line with the date, one line with its edit count,
# stepping back one day at a time from the first revision's day to the
# last revision's day (days without edits are reported as 0).
if revisions:
    firstdate = _revision_date(revisions[0])
    # [-1] rather than [499]: fewer than 500 revisions may be returned.
    lastdate = _revision_date(revisions[-1])
    currentdate = firstdate
    # ">=" rather than "!=" so the loop also ends when the first revision
    # is older than the last one instead of counting down indefinitely.
    while currentdate >= lastdate:
        print(currentdate)
        print(edits_per_day.get(currentdate, 0))
        currentdate = currentdate - datetime.timedelta(days=1)