Wikipedia-Counter: Difference between revisions
Marie Wocher (talk | contribs) (Created page with "A script that can count how many edits there have been at one day at one particular Wikipedia page. <source lang="python">") |
Marie Wocher (talk | contribs) No edit summary |
||
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
A script that can count how many edits there have been on one day on one particular Wikipedia page. | A script that can count how many edits there have been on one day on one particular Wikipedia page. The script can count at most 500 edits, starting from the revision of a date you choose by adjusting the ID of the page. | ||
<source lang="python"> | <source lang="python"> | ||
#!/usr/bin/python | |||
import sys, json, urllib2, os | |||
import re | |||
from pprint import pprint | |||
import html5lib, lxml, lxml.cssselect | |||
import urllib2, datetime | |||
def getCSS(url, selector):
    """Fetch *url* and return the elements matching a CSS selector.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    by html5lib into an lxml tree (HTML elements are not namespaced), and
    queried with ``lxml.cssselect``.

    Args:
        url: Address of the HTML page to download.
        selector: CSS selector string to evaluate against the parsed page.

    Returns:
        A list of the lxml elements matched by *selector*.
    """
    htmlparser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False,
    )
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(response)
    finally:
        # Release the connection even when parsing fails; urllib2 responses
        # are not context managers, hence the explicit try/finally.
        response.close()
    # Use a separate name so the caller-supplied selector string is not
    # overwritten by the compiled selector object.
    matcher = lxml.cssselect.CSSSelector(selector)
    return list(matcher(page))
import collections

# Replace PAGE-TITLE and START-ID in the URL, and PAGE-ID below, with the
# values of the page to inspect. The request asks for up to 500 revisions.
API_URL = ("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE"
           "&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json")
PAGE_ID = "PAGE-ID"


def revision_date(revision):
    """Return the calendar date of a revision.

    Args:
        revision: A revision dict whose ``"timestamp"`` value starts with
            ``YYYY-MM-DD``.

    Returns:
        The corresponding ``datetime.date``.
    """
    stamp = revision["timestamp"]
    return datetime.date(int(stamp[0:4]), int(stamp[5:7]), int(stamp[8:10]))


def count_edits_per_day(revisions):
    """Count the edits made on each day covered by *revisions*.

    Args:
        revisions: Sequence of revision dicts, each with a ``"timestamp"``.

    Returns:
        A list of ``(date, edit_count)`` tuples, one for every calendar day
        from the date of the first revision to the date of the last
        revision (both included, days without edits reported as 0). The
        list is empty when *revisions* is empty.
    """
    if not revisions:
        return []

    # One pass over the revisions instead of rescanning them for every day.
    edits_by_day = collections.Counter(revision_date(r) for r in revisions)

    # Use the real last element rather than index 499, so that pages with
    # fewer than 500 revisions do not raise IndexError.
    first_day = revision_date(revisions[0])
    last_day = revision_date(revisions[-1])

    # Walk towards the last day in whichever direction it lies, so the loop
    # terminates for newest-first as well as oldest-first revision lists.
    step = datetime.timedelta(days=1 if last_day >= first_day else -1)

    counts = []
    day = first_day
    while True:
        counts.append((day, edits_by_day[day]))
        if day == last_day:
            break
        day += step
    return counts


def main():
    """Download the revision list and print the edits per day.

    Prints the date parts and revision ID of every revision, a separator
    line, and then each covered day followed by its number of edits.
    Single-argument ``print(...)`` calls are used because they produce the
    same output under the Python 2 print statement.
    """
    response = urllib2.urlopen(API_URL)
    try:
        data = json.load(response)
    finally:
        response.close()

    # A page without a "revisions" entry is treated as having no edits.
    revisions = data['query']['pages'][PAGE_ID].get("revisions", [])

    for revision in revisions:
        stamp = revision["timestamp"]
        print("%s %s %s" % (stamp[0:4], stamp[5:7], stamp[8:10]))
        print(revision["revid"])
        print("**************************")
    print("==============================")

    for day, edit_count in count_edits_per_day(revisions):
        print(day)
        print(edit_count)


if __name__ == "__main__":
    main()
Latest revision as of 12:21, 4 April 2012
A script that can count how many edits there have been on one day on one particular Wikipedia page. The script can count at most 500 edits, starting from the revision of a date you choose by adjusting the ID of the page.
<source lang="python">
#!/usr/bin/python
import sys, json, urllib2, os import re from pprint import pprint import html5lib, lxml, lxml.cssselect import urllib2, datetime
def getCSS(url, selector):
    """Fetch *url* and return the elements matching a CSS selector.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    by html5lib into an lxml tree (HTML elements are not namespaced), and
    queried with ``lxml.cssselect``.

    Args:
        url: Address of the HTML page to download.
        selector: CSS selector string to evaluate against the parsed page.

    Returns:
        A list of the lxml elements matched by *selector*.
    """
    htmlparser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False,
    )
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    response = urllib2.urlopen(request)
    try:
        page = htmlparser.parse(response)
    finally:
        # Release the connection even when parsing fails; urllib2 responses
        # are not context managers, hence the explicit try/finally.
        response.close()
    # Use a separate name so the caller-supplied selector string is not
    # overwritten by the compiled selector object.
    matcher = lxml.cssselect.CSSSelector(selector)
    return list(matcher(page))
import collections

# Replace PAGE-TITLE and START-ID in the URL, and PAGE-ID below, with the
# values of the page to inspect. The request asks for up to 500 revisions.
API_URL = ("http://en.wikipedia.org/w/api.php?&action=query&titles=PAGE-TITLE"
           "&rvstartid=START-ID&prop=revisions&rvlimit=500&format=json")
PAGE_ID = "PAGE-ID"


def revision_date(revision):
    """Return the calendar date of a revision.

    Args:
        revision: A revision dict whose ``"timestamp"`` value starts with
            ``YYYY-MM-DD``.

    Returns:
        The corresponding ``datetime.date``.
    """
    stamp = revision["timestamp"]
    return datetime.date(int(stamp[0:4]), int(stamp[5:7]), int(stamp[8:10]))


def count_edits_per_day(revisions):
    """Count the edits made on each day covered by *revisions*.

    Args:
        revisions: Sequence of revision dicts, each with a ``"timestamp"``.

    Returns:
        A list of ``(date, edit_count)`` tuples, one for every calendar day
        from the date of the first revision to the date of the last
        revision (both included, days without edits reported as 0). The
        list is empty when *revisions* is empty.
    """
    if not revisions:
        return []

    # One pass over the revisions instead of rescanning them for every day.
    edits_by_day = collections.Counter(revision_date(r) for r in revisions)

    # Use the real last element rather than index 499, so that pages with
    # fewer than 500 revisions do not raise IndexError.
    first_day = revision_date(revisions[0])
    last_day = revision_date(revisions[-1])

    # Walk towards the last day in whichever direction it lies, so the loop
    # terminates for newest-first as well as oldest-first revision lists.
    step = datetime.timedelta(days=1 if last_day >= first_day else -1)

    counts = []
    day = first_day
    while True:
        counts.append((day, edits_by_day[day]))
        if day == last_day:
            break
        day += step
    return counts


def main():
    """Download the revision list and print the edits per day.

    Prints the date parts and revision ID of every revision, a separator
    line, and then each covered day followed by its number of edits.
    Single-argument ``print(...)`` calls are used because they produce the
    same output under the Python 2 print statement.
    """
    response = urllib2.urlopen(API_URL)
    try:
        data = json.load(response)
    finally:
        response.close()

    # A page without a "revisions" entry is treated as having no edits.
    revisions = data['query']['pages'][PAGE_ID].get("revisions", [])

    for revision in revisions:
        stamp = revision["timestamp"]
        print("%s %s %s" % (stamp[0:4], stamp[5:7], stamp[8:10]))
        print(revision["revid"])
        print("**************************")
    print("==============================")

    for day, edit_count in count_edits_per_day(revisions):
        print(day)
        print(edit_count)


if __name__ == "__main__":
    main()