Web Spider in Python: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "Using html5lib <source lang="python"> import html5lib, urllib url = "http://wikipedia.org/" html = urllib.urlopen(url).read() tree = html5lib.parse(html, namespaceHTMLElemen...")
 
No edit summary
 
(One intermediate revision by the same user not shown)
Line 2: Line 2:


<source lang="python">
<source lang="python">
import html5lib, urllib
import html5lib, urllib, urlparse


url = "http://wikipedia.org/"
history = set()
html = urllib.urlopen(url).read()
todo = ["http://automatist.org/"]
tree = html5lib.parse(html, namespaceHTMLElements=False)
PREFIX = todo[0]
for a in tree.findall(".//a"):
 
    print "a element", a
while todo:
    url = todo.pop()
    print url
    history.add(url)
    html = urllib.urlopen(url).read()
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        if a.attrib.get("href"):
            href = urlparse.urljoin(url, a.attrib.get("href"))
            if href not in history and href.startswith(PREFIX):
                todo.append(href)
</source>
</source>

Latest revision as of 18:40, 4 March 2014

Using html5lib (Python 2) to crawl every page whose URL starts with the seed URL:

import html5lib, urllib, urlparse

# Crawl state. `history` holds every URL that has already been fetched and
# `todo` is the stack of URLs still to visit. Only links whose absolute URL
# starts with PREFIX (the seed URL) are followed.
history = set()
todo = ["http://automatist.org/"]
PREFIX = todo[0]

while todo:
    url = todo.pop()
    # A link can be queued several times before its first fetch (repeated on
    # one page, or found on two different pages), so check again on the way
    # out of the stack to avoid downloading the same page twice.
    if url in history:
        continue
    print(url)
    history.add(url)
    try:
        html = urllib.urlopen(url).read()
    except IOError as err:
        # One unreachable page should not abort the rest of the crawl.
        print("skipping %s: %s" % (url, err))
        continue
    tree = html5lib.parse(html, namespaceHTMLElements=False)
    for a in tree.findall(".//a"):
        link = a.attrib.get("href")
        if not link:
            continue
        # Make the link absolute relative to the current page and drop any
        # "#fragment": fragments address a position inside the same document,
        # so keeping them would queue one page under many different URLs.
        href = urlparse.urldefrag(urlparse.urljoin(url, link))[0]
        if href not in history and href.startswith(PREFIX):
            todo.append(href)