Web Spider in Python: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
 
Line 4: Line 4:
import html5lib, urllib, urlparse
import html5lib, urllib, urlparse


url = "http://wikipedia.org/"
history = set()
html = urllib.urlopen(url).read()
todo = ["http://automatist.org/"]
tree = html5lib.parse(html, namespaceHTMLElements=False)
PREFIX = todo[0]
for a in tree.findall(".//a"):
 
    if a.attrib.get("href"):
while todo:
        href = urlparse.urljoin(url, a.attrib.get("href"))
    url = todo.pop()
        print href
    print url
    history.add(url)
    html = urllib.urlopen(url).read()
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        if a.attrib.get("href"):
            href = urlparse.urljoin(url, a.attrib.get("href"))
            if href not in history and href.startswith(PREFIX):
                todo.append(href)
</source>
</source>

Latest revision as of 18:40, 4 March 2014

Using html5lib

import html5lib, urllib, urlparse

# Simple site-restricted web spider.
#
# Starting from a single seed URL, repeatedly take a URL off the work list,
# print it, download it, parse it with html5lib and queue every link that
# stays under the seed prefix.  Pages are taken from the end of `todo`
# (list.pop()), so the most recently discovered link is visited next.

# Every URL that has ever been queued.  A URL is recorded here at the moment
# it is put on `todo` (not when it is fetched); otherwise a page linked from
# several other pages before being visited would be queued -- and downloaded --
# once per referring page.
history = set()
todo = ["http://automatist.org/"]
# Only links whose absolute form starts with the seed URL are followed.
PREFIX = todo[0]
history.update(todo)

while todo:
    url = todo.pop()
    # Parenthesised form prints the same text under the Python 2 print
    # statement and is also valid syntax on later versions.
    print(url)
    html = urllib.urlopen(url).read()
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        href = a.attrib.get("href")
        if not href:
            # Anchor without a (non-empty) href: nothing to follow.
            continue
        # Resolve relative links against the current page, then drop any
        # "#fragment": it addresses a position inside the same document, so
        # keeping it would make the same page look like a new URL.
        href = urlparse.urldefrag(urlparse.urljoin(url, href))[0]
        if href not in history and href.startswith(PREFIX):
            history.add(href)
            todo.append(href)