Wikiwalker: Crawling wikipedia pages for images: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "<source lang="python"> from __future__ import print_function import urllib2, html5lib from urlparse import urljoin from xml.etree import ElementTree as ET start = "http://en....")
 
No edit summary
Line 1: Line 1:
== Step 1: Extracting infobox images from a wikipedia page ==
<source lang="python">
<source lang="python">
from __future__ import print_function
from __future__ import print_function

Revision as of 15:58, 26 May 2014

Step 1: Extracting infobox images from a Wikipedia page

from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

# Wikipedia article whose infobox images are listed.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"

# Queue of URLs still to fetch, and the set of URLs already fetched.
todo = [start]
seen = set()

while todo:
    url = todo.pop(0)
    if url in seen:
        continue
    # Record the visit; without this the "seen" check could never trigger
    # and a URL queued more than once would be fetched again.
    seen.add(url)

    response = urllib2.urlopen(url)
    try:
        print("VISITING", url)
        page_source = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # namespaceHTMLElements=False keeps tag names un-namespaced so the plain
    # ".//h1" / ".//table" / ".//img" paths below match.
    tree = html5lib.parse(page_source, namespaceHTMLElements=False)

    h1 = tree.find(".//h1")
    if h1 is not None:
        # method="html" prints the heading including its markup; switch to
        # method="text" to print only the text content.
        print("title", ET.tostring(h1, method="html"))

    for table in tree.findall(".//table"):
        # Only tables carrying the "infobox" class are of interest.
        if "infobox" in table.get("class", "").split():
            for img in table.findall(".//img"):
                # Resolve relative / protocol-relative image paths against
                # the URL of the page they were found on.
                print(urljoin(url, img.get("src", "")))