Web Spider in Python

From XPUB & Lens-Based wiki

Using html5lib

import html5lib, urllib, urlparse

# Simple same-site web spider.
#
# Starting from the single seed URL in `todo`, fetch each page, print its URL,
# and queue every link whose absolute form starts with PREFIX.
#
# `history` holds every URL that has been *queued* so far (not only the ones
# already fetched). Recording a URL at queue time guarantees that a link which
# appears on several pages -- or several times on one page -- is scheduled,
# and therefore downloaded, exactly once.
todo = ["http://automatist.org/"]
PREFIX = todo[0]
history = set(todo)

while todo:
    # list.pop() takes the most recently queued URL, so the crawl is depth-first.
    url = todo.pop()
    print(url)

    # NOTE: urllib.urlopen raises IOError when the connection cannot be made;
    # that is left unhandled here, so a network failure still stops the crawl.
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the socket even if reading the body fails.
        response.close()

    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        target = a.attrib.get("href")
        if not target:
            # Anchor without an href (or with an empty one): nothing to follow.
            continue

        # Resolve relative links against the current page, then drop any
        # "#fragment" so that page.html#a and page.html#b count as one page.
        href = urlparse.urldefrag(urlparse.urljoin(url, target))[0]

        if href.startswith(PREFIX) and href not in history:
            history.add(href)
            todo.append(href)