Web Spider in Python: Difference between revisions
(Created page with "Using html5lib <source lang="python"> import html5lib, urllib url = "http://wikipedia.org/" html = urllib.urlopen(url).read() tree = html5lib.parse(html, namespaceHTMLElemen...") |
No edit summary |
||
Line 2: | Line 2: | ||
<source lang="python"> | <source lang="python"> | ||
import html5lib, urllib | import html5lib, urllib, urlparse | ||
url = "http://wikipedia.org/" | url = "http://wikipedia.org/" | ||
Line 8: | Line 8: | ||
tree = html5lib.parse(html, namespaceHTMLElements=False) | tree = html5lib.parse(html, namespaceHTMLElements=False) | ||
for a in tree.findall(".//a"): | for a in tree.findall(".//a"): | ||
if a.attrib.get("href"): | |||
href = urlparse.urljoin(url, a.attrib.get("href")) | |||
print href | |||
</source> | </source> |
Revision as of 18:30, 4 March 2014
Using html5lib to list the absolute URL of every link on a page:
import html5lib, urllib, urlparse
# Page to fetch; it also serves as the base URL for resolving relative links.
url = "http://wikipedia.org/"

# Download the raw page. The handle is closed explicitly with try/finally so
# the connection is released even if read() raises (this also works on
# Python 2, where the object returned by urllib.urlopen() cannot be used in
# a `with` statement).
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# namespaceHTMLElements=False keeps element tags un-namespaced, so the plain
# ".//a" path below matches anchor elements.
tree = html5lib.parse(html, namespaceHTMLElements=False)

# Print the absolute form of every link on the page.
for a in tree.findall(".//a"):
    href = a.attrib.get("href")
    if not href:
        # Skip anchors with a missing or empty href.
        continue
    # urljoin leaves absolute hrefs intact and resolves relative ones
    # against the page URL.
    print(urlparse.urljoin(url, href))