Wikiwalker: Crawling wikipedia pages for images

From XPUB & Lens-Based wiki
Revision as of 16:57, 26 May 2014 by Michael Murtaugh (talk | contribs) (Created page with "<source lang="python"> from __future__ import print_function import urllib2, html5lib from urlparse import urljoin from xml.etree import ElementTree as ET start = "http://en....")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

# Wikiwalker: fetch each queued Wikipedia page once, print its <h1> title
# and the absolute URL of every image found inside an "infobox" table.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"

todo = [start]   # queue of URLs still to fetch, processed first-in first-out
seen = set()     # URLs already fetched, so no page is downloaded twice

while todo:
    url = todo.pop(0)
    if url in seen:
        continue
    # Record the visit; without this the "seen" check above can never fire.
    seen.add(url)

    f = urllib2.urlopen(url)
    print("VISITING", url)
    try:
        page = f.read()
    finally:
        # urllib2 responses are not context managers in Python 2, so close
        # the connection explicitly even if the read fails.
        f.close()

    # namespaceHTMLElements=False keeps tag names un-namespaced, so plain
    # paths such as ".//h1" match.
    tree = html5lib.parse(page, namespaceHTMLElements=False)

    h1 = tree.find(".//h1")
    # Identity test on purpose: an Element without children is falsy, so a
    # plain truthiness check would skip a title that only contains text.
    if h1 is not None:
        # method="html" prints the heading with its markup; method="text"
        # would print the bare title text instead.
        print("title", ET.tostring(h1, method="html"))

    for table in tree.findall(".//table"):
        if "infobox" not in table.get("class", "").split():
            continue
        for img in table.findall(".//img"):
            img_src = img.get("src", "")
            if not img_src:
                # urljoin(url, "") returns the page URL itself, which is
                # not an image; skip <img> elements that have no src.
                continue
            # Resolve relative and protocol-relative srcs against the page.
            print(urljoin(url, img_src))