Web Spider in Python

From XPUB & Lens-Based wiki

Using html5lib

import html5lib, urllib, urlparse

# Python 2 script (relies on urllib.urlopen and the urlparse module):
# download a single page and print every link found on it as an absolute URL.
url = "http://wikipedia.org/"

# Close the HTTP response explicitly once the body has been read, so the
# connection is released even if read() raises.
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# namespaceHTMLElements=False keeps tag names un-namespaced, so the plain
# ".//a" path below matches the anchor elements.
tree = html5lib.parse(html, namespaceHTMLElements=False)
for a in tree.findall(".//a"):
    link = a.attrib.get("href")
    # Skip anchors whose href is missing or empty.
    if link:
        # Resolve relative links against the URL of the page they came from.
        href = urlparse.urljoin(url, link)
        print(href)