Wikiwalker: Crawling wikipedia pages for images
Step 1: Extracting infobox images from a wikipedia page
In this code, note the use of ElementTree's tostring function to convert a document element back into text. tostring has an optional method attribute with a number of interesting values: "html" and "xml" both output markup, either loose (html) or strict (xml) — the latter is useful if you want to feed the output into strict XML tools. Finally, the "text" method outputs text only, effectively stripping any HTML tags; this is useful when you want just the text.
from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

# Step 1: fetch one Wikipedia article and print the images found in its
# "infobox" table. `todo` is a FIFO queue of URLs; `seen` records pages
# already visited so no URL is fetched twice.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"
todo = [start]
seen = set()

while len(todo) > 0:
    # Pop the first pending URL.
    url, todo = todo[0], todo[1:]
    if url not in seen:
        seen.add(url)  # fix: mark as visited, otherwise the check above never fires
        print("VISITING", url)
        f = urllib2.urlopen(url)
        src = f.read()
        f.close()  # fix: release the network handle once the body is read
        # namespaceHTMLElements=False keeps tag names plain ("table",
        # not "{http://www.w3.org/1999/xhtml}table"), so findall works as written.
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        h1 = tree.find(".//h1")
        if h1 is not None:
            # method="html" keeps the markup; method="text" would strip all tags.
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html"))
        # Only images inside a table whose class list contains "infobox" count.
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    # Resolve relative / protocol-relative src against the page URL.
                    src = urljoin(url, src)
                    print(src)
Step 2: Crawling the links, outputting to an HTML file
from __future__ import print_function
import urllib2, html5lib, random
from urlparse import urljoin
from xml.etree import ElementTree as ET

# Step 2: starting from one article, repeatedly follow a random internal
# wiki link, writing each page's title and infobox images to an HTML file.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"
# start = sys.argv[1]
todo = [start]
seen = set()
htmloutput = open("wikiwalk.html", "w")

while len(todo) > 0:
    # Pop the first pending URL (FIFO queue).
    url, todo = todo[0], todo[1:]
    if url not in seen:
        seen.add(url)  # fix: without this the crawler can revisit pages forever
        print("VISITING", url)
        f = urllib2.urlopen(url)
        src = f.read()
        f.close()  # fix: release the network handle once the body is read
        # namespaceHTMLElements=False keeps tag names plain so findall works.
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        h1 = tree.find(".//h1")
        if h1 is not None:
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html").strip().encode("utf-8"))
            print(ET.tostring(h1, method="html").encode("utf-8"), file=htmloutput)
        # Record every infobox image both on the console and in the output page.
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    src = urljoin(url, src)
                    print("image", src)
                    print('<img src="' + src + '" />', file=htmloutput)
        # Collect candidate links from the article body, then queue one at random.
        for div in tree.findall(".//div"):
            if "mw-content-text" == div.get("id", ""):
                # print ("found main div", div)
                links = []
                for a in div.findall(".//a"):
                    # Skip links MediaWiki marks as external.
                    if not ("external" in a.get("class", "").split()):
                        href = a.get("href", "").strip()
                        linkurl = urljoin(url, href)
                        if href.startswith("/wiki"):
                            # print(ET.tostring(a))
                            linktitle = href.strip("/").split("/", 1)[1]
                            # Exclude namespaced pages (File:, Talk:, ...) and anchors.
                            if not (':' in linktitle or '#' in linktitle):
                                # print(linktitle)
                                links.append(linkurl)
                # fix: random.choice raises IndexError on an empty sequence,
                # which would crash the walk on a page with no usable links.
                if links:
                    rlink = random.choice(links)
                    todo.append(rlink)

htmloutput.close()  # flush and close the output page when the walk ends
Output
See http://pzwart3.wdka.hro.nl/~mmurtaugh/wikiwalk.html
Puzzle
How would you change the code so that only pages containing infobox images appear in the output?