Wikiwalker: Crawling wikipedia pages for images

From XPUB & Lens-Based wiki

Step 1: Extracting infobox images from a wikipedia page

In this code, note the use of ElementTree's tostring function to convert a document element back into text. tostring has an optional method attribute with a number of interesting values: "html" and "xml" both output HTML code, either loose (html) or strict (xml) — the latter being useful if you want to feed the output into strict XML tools. Finally, the "text" method outputs text only, effectively stripping any HTML tags, which is useful when you want just the text.

from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

start = "http://en.wikipedia.org/wiki/J._D._Salinger"

todo = [start]   # queue of page URLs still to visit
seen = set()     # page URLs already visited, so no page is fetched twice

while todo:
    # pop the first URL off the front of the queue
    url, todo = todo[0], todo[1:]
    if url not in seen:
        # BUG FIX: record the visit -- the original never added to `seen`,
        # so the dedup check could never take effect.
        seen.add(url)
        f = urllib2.urlopen(url)
        print("VISITING", url)
        src = f.read()
        # BUG FIX: close the HTTP response once the body has been read
        f.close()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # the page title lives in the first <h1> element
        h1 = tree.find(".//h1")
        if h1 is not None:
            # method="text" would strip all tags; method="html" keeps loose HTML
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html"))

        # infobox images: <img> tags inside <table class="... infobox ...">
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    # img src is often protocol-relative; resolve it against the page URL
                    src = urljoin(url, src)
                    print(src)

Step 2: Crawling the links, outputting to an HTML file

from __future__ import print_function
import urllib2, html5lib, random
from urlparse import urljoin
from xml.etree import ElementTree as ET

start = "http://en.wikipedia.org/wiki/J._D._Salinger"
# start = sys.argv[1]

todo = [start]   # queue of page URLs still to visit
seen = set()     # page URLs already visited, so no page is fetched twice

htmloutput = open("wikiwalk.html", "w")

while todo:
    # pop the first URL off the front of the queue
    url, todo = todo[0], todo[1:]
    if url not in seen:
        # BUG FIX: record the visit -- the original never added to `seen`,
        # so the random walk could loop back through the same pages forever.
        seen.add(url)
        f = urllib2.urlopen(url)
        print("VISITING", url)
        src = f.read()
        # BUG FIX: close the HTTP response once the body has been read
        f.close()
        tree = html5lib.parse(src, namespaceHTMLElements=False)

        # the page title lives in the first <h1> element
        h1 = tree.find(".//h1")
        if h1 is not None:
            # method="html" keeps the markup; .strip() drops surrounding whitespace
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html").strip().encode("utf-8"))
            print(ET.tostring(h1, method="html").encode("utf-8"), file=htmloutput)

        # infobox images: <img> tags inside <table class="... infobox ...">
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    # img src is often protocol-relative; resolve it against the page URL
                    src = urljoin(url, src)
                    print("image", src)
                    print('<img src="' + src + '" />', file=htmloutput)
        # flush after each page so partial output survives an interrupted walk
        htmloutput.flush()

        # pick one random internal article link from the main content div
        for div in tree.findall(".//div"):
            if "mw-content-text" == div.get("id", ""):
                links = []
                for a in div.findall(".//a"):
                    if not ("external" in a.get("class", "").split()):
                        href = a.get("href", "").strip()
                        linkurl = urljoin(url, href)
                        # BUG FIX: require the trailing slash -- a bare "/wiki"
                        # href would crash the [1] index in the split below
                        if href.startswith("/wiki/"):
                            linktitle = href.strip("/").split("/", 1)[1]
                            # skip special pages (File:, Category:, ...) and anchors
                            if not (':' in linktitle or '#' in linktitle):
                                links.append(linkurl)
                # BUG FIX: random.choice raises IndexError on an empty list;
                # a dead-end page now simply ends the walk instead of crashing
                if links:
                    todo.append(random.choice(links))

# all done: close the output file so buffered writes hit disk
htmloutput.close()

Output

See walk 1, walk 2, walk 3

Puzzle

How to make only pages that have (infobox) images appear?