Web Spider in Python: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "Using html5lib <source lang="python"> import html5lib, urllib url = "http://wikipedia.org/" html = urllib.urlopen(url).read() tree = html5lib.parse(html, namespaceHTMLElemen...")
 
No edit summary
Line 2: Line 2:


<source lang="python">
<source lang="python">
import html5lib, urllib
import html5lib, urllib, urlparse


url = "http://wikipedia.org/"
url = "http://wikipedia.org/"
Line 8: Line 8:
tree = html5lib.parse(html, namespaceHTMLElements=False)
tree = html5lib.parse(html, namespaceHTMLElements=False)
for a in tree.findall(".//a"):
for a in tree.findall(".//a"):
     print "a element", a
     if a.attrib.get("href"):
        href = urlparse.urljoin(url, a.attrib.get("href"))
        print href
</source>
</source>

Revision as of 19:30, 4 March 2014

Using html5lib

import contextlib

import html5lib

try:
    # Python 3 module layout.
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:
    # Python 2 fallback: the same functions under their older module names.
    from urllib import urlopen
    from urlparse import urljoin

# Page whose links are listed; also the base for resolving relative hrefs.
url = "http://wikipedia.org/"

# closing() releases the connection even if read() raises; it is used instead
# of a bare "with urlopen(...)" because the Python 2 response object is not a
# context manager.
with contextlib.closing(urlopen(url)) as response:
    html = response.read()

# namespaceHTMLElements=False keeps tag names plain ("a" rather than a
# namespaced name), so the simple ".//a" path below matches.
tree = html5lib.parse(html, namespaceHTMLElements=False)

for a in tree.findall(".//a"):
    link = a.attrib.get("href")
    # Skip anchors with no href (or an empty one): there is nothing to follow.
    if link:
        # Resolve relative links against the page URL to get an absolute one.
        href = urljoin(url, link)
        print(href)