Wikiwalker: Crawling Wikipedia pages for images
Revision as of 15:58, 26 May 2014 by Michael Murtaugh (talk | contribs)
Step 1: Extracting infobox images from a Wikipedia page
from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET
# Crawl driver: fetch each queued page, print its <h1> title, and print the
# absolute URL of every image found inside an "infobox" table.
#
# `todo` is the queue of pages still to fetch; `seen` holds the pages already
# fetched so that no URL is downloaded twice. Nothing in this step appends
# to `todo`, so only the start page is visited.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"
todo = [start]
seen = set()

while todo:
    url = todo.pop(0)
    if url in seen:
        continue
    # Record the visit; without this the membership test above never fires.
    seen.add(url)

    response = urllib2.urlopen(url)
    try:
        print("VISITING", url)
        page_source = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # namespaceHTMLElements=False keeps tag names plain ("h1", "table", ...)
    # so the simple ".//tag" paths below match.
    tree = html5lib.parse(page_source, namespaceHTMLElements=False)

    h1 = tree.find(".//h1")
    # Identity test on purpose: only a missing <h1> (None) should be skipped.
    if h1 is not None:
        # method="html" prints the heading markup; use method="text" to
        # print only the text content instead.
        print("title", ET.tostring(h1, method="html"))

    for table in tree.findall(".//table"):
        # Match "infobox" as a whole class token, not as a substring.
        if "infobox" not in table.get("class", "").split():
            continue
        for img in table.findall(".//img"):
            # Resolve relative and protocol-relative src values against the
            # URL of the page they were found on.
            image_url = urljoin(url, img.get("src", ""))
            print(image_url)