Wikiwalker: Crawling wikipedia pages for images: Difference between revisions
(Created page with "<source lang="python"> from __future__ import print_function import urllib2, html5lib from urlparse import urljoin from xml.etree import ElementTree as ET start = "http://en....") |
No edit summary |
||
Line 1: | Line 1: | ||
== Step 1: Extracting infobox images from a wikipedia page == | |||
<source lang="python"> | <source lang="python"> | ||
from __future__ import print_function | from __future__ import print_function |
Revision as of 15:58, 26 May 2014
Step 1: Extracting infobox images from a Wikipedia page
from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET
# Step 1 of the crawler: visit each queued page, print its title and the
# absolute URL of every image found inside an "infobox" table.
start = "http://en.wikipedia.org/wiki/J._D._Salinger"
todo = [start]   # FIFO queue of page URLs still to visit
seen = set()     # URLs already visited, so no page is fetched twice

while todo:
    url = todo.pop(0)
    if url in seen:
        continue
    # Record the visit before fetching; the original never updated `seen`,
    # which made the membership test above a no-op.
    seen.add(url)

    print("VISITING", url)
    response = urllib2.urlopen(url)
    try:
        page_source = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        response.close()

    # namespaceHTMLElements=False keeps tag names un-namespaced, so plain
    # paths such as ".//h1" match.
    tree = html5lib.parse(page_source, namespaceHTMLElements=False)

    h1 = tree.find(".//h1")
    # Compare by identity: find() returns None when there is no match.
    if h1 is not None:
        # method="html" prints the heading with its markup; method="text"
        # would print only the text content.
        print("title", ET.tostring(h1, method="html"))

    for table in tree.findall(".//table"):
        # The class attribute is a space-separated list of class names.
        if "infobox" not in table.get("class", "").split():
            continue
        for img in table.findall(".//img"):
            # Resolve the src attribute against the page URL to get an
            # absolute image URL.
            img_url = urljoin(url, img.get("src", ""))
            print(img_url)