Web Spider in Python: Difference between revisions
No edit summary |
No edit summary |
||
Line 4: | Line 4: | ||
import html5lib, urllib, urlparse | import html5lib, urllib, urlparse | ||
history = set() | |||
html = urllib.urlopen(url).read() | todo = ["http://automatist.org/"] | ||
tree = html5lib.parse(html, namespaceHTMLElements=False) | PREFIX = todo[0] | ||
for a in tree.findall(".//a"): | |||
while todo: | |||
url = todo.pop() | |||
print url | |||
history.add(url) | |||
html = urllib.urlopen(url).read() | |||
tree = html5lib.parse(html, namespaceHTMLElements=False) | |||
for a in tree.findall(".//a"): | |||
if a.attrib.get("href"): | |||
href = urlparse.urljoin(url, a.attrib.get("href")) | |||
if href not in history and href.startswith(PREFIX): | |||
todo.append(href) | |||
</source> | </source> |
Latest revision as of 18:40, 4 March 2014
Using html5lib
import html5lib, urllib, urlparse
# Simple single-site spider: starting from the first entry of ``todo``, fetch
# each page, print its URL, and queue every not-yet-seen link that stays
# under PREFIX.

todo = ["http://automatist.org/"]
# Only links whose absolute URL begins with PREFIX are followed.
PREFIX = todo[0]
# Every URL that has ever been queued (fetched or still pending). Recording
# URLs at enqueue time, rather than at fetch time, keeps a page that is linked
# from several places from being queued -- and downloaded -- more than once.
history = set(todo)

while todo:
    url = todo.pop()
    print(url)

    # Close the connection explicitly instead of leaving it to the garbage
    # collector; a long crawl would otherwise accumulate open sockets.
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        link = a.attrib.get("href")
        if not link:
            # Skip anchors with a missing or empty href.
            continue
        # Resolve relative links against the current page, then drop any
        # "#fragment" so in-page anchors do not count as separate pages.
        href = urlparse.urldefrag(urlparse.urljoin(url, link))[0]
        if href not in history and href.startswith(PREFIX):
            history.add(href)
            todo.append(href)