Wikiwalker: Crawling wikipedia pages for images
== Step 1: Extracting infobox images from a wikipedia page ==
In the code below, note the use of ElementTree's ''[http://docs.python.org/2/library/xml.etree.elementtree.html#xml.etree.ElementTree.tostring tostring]'' function to convert a document element back into text. ''tostring'' has an optional ''method'' argument with a number of interesting values: ''html'' and ''xml'' both output markup, either loose (html) or strict (xml), the latter being useful if you want to feed the output into strict XML tools. Finally, the ''text'' method outputs "text only", effectively stripping any HTML tags, useful for when you want just the text.
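For a quick illustration of the three values (a minimal sketch on a toy element, separate from the crawler below):

<source lang="python">
from xml.etree import ElementTree as ET

el = ET.fromstring('<p>Hello <b>world</b><br/></p>')
print(ET.tostring(el, method="xml"))   # <p>Hello <b>world</b><br /></p>
print(ET.tostring(el, method="html"))  # <p>Hello <b>world</b><br></p>
print(ET.tostring(el, method="text"))  # Hello world
</source>

The full extraction script: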
<source lang="python">
from __future__ import print_function
import urllib2, html5lib
from urlparse import urljoin
from xml.etree import ElementTree as ET

start = "http://en.wikipedia.org/wiki/J._D._Salinger"
todo = [start]
seen = set()
while len(todo) > 0:
    url, todo = todo[0], todo[1:]
    if url not in seen:
        seen.add(url)  # remember this page so it is never fetched twice
        f = urllib2.urlopen(url)
        print("VISITING", url)
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        h1 = tree.find(".//h1")
        if h1 is not None:
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html"))
        # infobox images sit inside tables with class "infobox"
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    src = urljoin(url, src)
                    print(src)
</source>
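Why the ''urljoin'' call? The ''src'' attributes of Wikipedia's images are typically protocol-relative (they begin with ''//''), so they have to be resolved against the page URL before they can be fetched or linked to. A quick sketch (the image path here is made up):

<source lang="python">
from urlparse import urljoin

# a hypothetical protocol-relative src, as found in Wikipedia img tags
src = "//upload.wikimedia.org/wikipedia/commons/x/xx/Example.jpg"
print(urljoin("http://en.wikipedia.org/wiki/J._D._Salinger", src))
# -> http://upload.wikimedia.org/wikipedia/commons/x/xx/Example.jpg
</source>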
== Step 2: Crawling the links, outputting to an HTML file == | |||
<source lang="python"> | |||
from __future__ import print_function
import urllib2, html5lib, random
from urlparse import urljoin
from xml.etree import ElementTree as ET

start = "http://en.wikipedia.org/wiki/J._D._Salinger"
# start = sys.argv[1]  # (add "import sys" above to use this)
todo = [start]
seen = set()
htmloutput = open("wikiwalk.html", "w")
while len(todo) > 0:
    url, todo = todo[0], todo[1:]
    if url not in seen:
        seen.add(url)  # remember visited pages so the walk never repeats one
        f = urllib2.urlopen(url)
        print("VISITING", url)
        src = f.read()
        tree = html5lib.parse(src, namespaceHTMLElements=False)
        h1 = tree.find(".//h1")
        if h1 is not None:
            # print("title", ET.tostring(h1, method="text"))
            print("title", ET.tostring(h1, method="html").strip().encode("utf-8"))
            print(ET.tostring(h1, method="html").encode("utf-8"), file=htmloutput)
        for table in tree.findall(".//table"):
            if "infobox" in table.get("class", "").split():
                for img in table.findall(".//img"):
                    src = img.get("src", "")
                    src = urljoin(url, src)
                    print("image", src)
                    print('<img src="' + src + '" />', file=htmloutput)
        for div in tree.findall(".//div"):
            if div.get("id", "") == "mw-content-text":
                # print("found main div", div)
                links = []
                for a in div.findall(".//a"):
                    if "external" not in a.get("class", "").split():
                        href = a.get("href", "").strip()
                        linkurl = urljoin(url, href)
                        if href.startswith("/wiki/"):
                            # print(ET.tostring(a))
                            linktitle = href.strip("/").split("/", 1)[1]
                            # skip special pages (File:, Category:, ...) and anchors
                            if not (':' in linktitle or '#' in linktitle):
                                # print(linktitle)
                                links.append(linkurl)
                if links:  # a dead-end page would otherwise crash random.choice
                    rlink = random.choice(links)
                    todo.append(rlink)
        htmloutput.flush()  # so wikiwalk.html can be followed while the walk runs
</source> | |||
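Note the overall structure: ''todo'' is used as a queue, but since only one randomly chosen link (''random.choice'') is appended per page, the "crawl" is really a random walk from article to article. Appending all of ''links'' instead would turn it into a breadth-first crawl of Wikipedia.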
== Output == | |||
See [http://pzwart3.wdka.hro.nl/~mmurtaugh/wikiwalk.html walk 1], [http://pzwart3.wdka.hro.nl/~mmurtaugh/wikiwalk02.html walk 2], [http://pzwart3.wdka.hro.nl/~mmurtaugh/wikiwalk03.html walk 3] | |||
== Puzzle == | |||
How to make only pages that have (infobox) images appear?
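One possible approach (a sketch with made-up data, not wired into the script above): buffer each page's output rather than printing it immediately, and only write the buffered section when the page yielded at least one image.

<source lang="python">
from __future__ import print_function

# hypothetical per-page results: (title, list of image urls)
pages = [("Page A", ["http://example.com/a.jpg"]), ("Page B", [])]

htmloutput = open("wikiwalk_images_only.html", "w")
for title, images in pages:
    if images:  # skip pages without (infobox) images
        print("<h1>" + title + "</h1>", file=htmloutput)
        for src in images:
            print('<img src="' + src + '" />', file=htmloutput)
htmloutput.close()
</source>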