Web Spider in Python

From XPUB & Lens-Based wiki
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and use your browser's default print function instead.

Using html5lib

import html5lib, urllib, urlparse

# Crawl every page reachable from the start URL whose address begins with
# PREFIX, printing each URL as it is taken off the queue.
# NOTE: relies on the Python 2 modules `urllib` (urlopen) and `urlparse`.

history = set()                    # URLs already taken off the queue
todo = ["http://automatist.org/"]  # URLs still to fetch; used as a stack
PREFIX = todo[0]                   # only links starting with this are followed
queued = set(todo)                 # every URL ever put on `todo`, so that a
                                   # link is never queued (and fetched) twice

while todo:
    url = todo.pop()
    print(url)
    history.add(url)
    try:
        response = urllib.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except IOError as err:
        # One unreachable page should not abort the whole crawl.
        print("skipping %s: %s" % (url, err))
        continue
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        target = a.attrib.get("href")
        if not target:
            continue
        # Resolve relative links against the current page, then drop any
        # "#fragment": it points inside the same document, so keeping it
        # would make the same page look like several different URLs.
        href = urlparse.urldefrag(urlparse.urljoin(url, target))[0]
        if href not in queued and href.startswith(PREFIX):
            queued.add(href)
            todo.append(href)