Wikiwalker: Crawling Wikipedia pages for images

From XPUB & Lens-Based wiki

Step 1: Extracting infobox images from a Wikipedia page

from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

# Page the crawl begins from.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"

# Queue of URLs still to visit, and the set of URLs already visited.
todo = [start]
seen = set()

while todo:
    # Take the next URL from the front of the queue.
    url = todo.pop(0)
    if url in seen:
        continue
    # Record the visit so a URL that is queued more than once is only
    # fetched once.
    seen.add(url)

    f = urllib2.urlopen(url)
    try:
        print("VISITING", url)
        page = f.read()
    finally:
        # Always release the connection, even if reading fails.
        f.close()

    # namespaceHTMLElements=False keeps tag names un-namespaced, so plain
    # paths such as ".//h1" match.
    tree = html5lib.parse(page, namespaceHTMLElements=False)

    # Print the first <h1> as HTML markup.
    # (Use method="text" instead to print only the heading's text content.)
    h1 = tree.find(".//h1")
    if h1 is not None:
        print("title", ET.tostring(h1, method="html"))

    # Print the absolute URL of every image found inside a table whose
    # class attribute contains the word "infobox".
    for table in tree.findall(".//table"):
        if "infobox" not in table.get("class", "").split():
            continue
        for img in table.findall(".//img"):
            # src may be relative or protocol-relative; resolve it against
            # the URL of the page it came from.
            print(urljoin(url, img.get("src", "")))