Web Spider in Python
Revision as of 18:40, 4 March 2014 by Michael Murtaugh (talk | contribs)
Using html5lib
import html5lib, urllib, urlparse
# Minimal web spider (Python 2: relies on urllib.urlopen and the urlparse
# module imported at the top of the file).
#
# Starting from a single seed URL, fetch each page, extract every <a href>,
# and follow the links that stay under PREFIX.  Each visited URL is printed
# to stdout; pages that cannot be fetched are reported on stderr and skipped.
import sys

history = set()                    # URLs that have actually been fetched
todo = ["http://automatist.org/"]  # URLs waiting to be fetched (used as a stack)
PREFIX = todo[0]                   # only URLs starting with this are followed
queued = set(todo)                 # every URL ever scheduled, so none is queued twice

while todo:
    # list.pop() takes the most recently added URL, so the crawl order is
    # last-in, first-out.
    url = todo.pop()
    print(url)
    history.add(url)

    try:
        response = urllib.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except IOError as err:
        # One unreachable page should not abort the whole crawl.
        sys.stderr.write("skipping %s: %s\n" % (url, err))
        continue

    # namespaceHTMLElements=False keeps tag names un-namespaced, so the plain
    # ".//a" path below matches anchor elements.
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        link = a.attrib.get("href")
        if not link:
            continue
        # Resolve relative links against the current page and drop any
        # "#fragment": fragments address a position inside the same document,
        # not a different page to download.
        href, _fragment = urlparse.urldefrag(urlparse.urljoin(url, link))
        if href in queued or not href.startswith(PREFIX):
            continue
        queued.add(href)
        todo.append(href)