Web Spider in Python
Revision as of 18:30, 4 March 2014 by Michael Murtaugh (talk | contribs)
Using html5lib
import html5lib, urllib, urlparse
# Fetch a single page and print every hyperlink on it as an absolute URL.
url = "http://wikipedia.org/"

# Read the whole document, then release the connection even if read() fails.
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# namespaceHTMLElements=False leaves tag names un-namespaced, so the plain
# ".//a" path below can match the anchor elements.
tree = html5lib.parse(html, namespaceHTMLElements=False)

for anchor in tree.findall(".//a"):
    target = anchor.attrib.get("href")
    if not target:
        # Anchors with a missing or empty href have nothing to follow.
        continue
    # Resolve relative links against the URL of the page they came from.
    href = urlparse.urljoin(url, target)
    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(href)