WebSpiders

From XPUB & Lens-Based wiki
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and use your browser's default print function instead.

Custom web spider in Python

Code attached below!

Demonstrates Python 2's built-in urllib2 and urlparse modules. Uses the external Python module html5lib for robust HTML parsing (see link for installation instructions).

A "spider" typically follows the links it finds on a page, then repeats this process (as many times as desired). In this simple spider, the search is only one-level deep, for images (in HTML img tags), on an HTML page.

import urllib2, urlparse, os, sys
import html5lib
 
def absolutizeURL (href, base):
    """
    Return href as an absolute URL.

    Links that already start with "http://" (case-insensitive) pass
    through unchanged; anything else is resolved relative to base.
    """
    if href.lower().startswith("http://"):
        return href
    return urlparse.urljoin(base, href)

def openURL (url, data):
    """
    Open url and return (pagefile, realurl).

    Sends a browser-like User-Agent header; data, when not None, makes
    this a POST request. realurl may differ from url when the server
    issued a redirect.
    """
    request = urllib2.Request(url)
    request.add_header(
        "User-Agent",
        "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    pagefile = urllib2.urlopen(request, data)
    return (pagefile, pagefile.geturl())

def downloadURL (url, foldername=""):
    """
    Download the file at url and return the number of bytes written.

    The local filename is the last path component of the URL, placed
    inside foldername (if given). A browser-like User-Agent header is
    sent; redirects are followed by urllib2.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)

    # make a filename based on the URL, inside foldername (if given)
    urlpath = urlparse.urlparse(url)[2]
    (path, filename) = os.path.split(urlpath)
    filename = os.path.join(foldername, filename)
    out = open(filename, "wb")
    bytes = 0
    # try/finally so both handles are closed even if a read/write fails
    try:
        while True:
            data = pagefile.read(1024)
            if not data:
                break
            bytes += len(data)
            out.write(data)
    finally:
        pagefile.close()
        out.close()
    return bytes


def spiderImages (url, postdata=None, foldername=""):
    """
    Open the HTML page at url and download every image referenced by an
    <img> tag (images referenced only from CSS are not fetched).
    Files are saved inside foldername when given (created if missing).
    Returns (count, totalbytes): images downloaded and bytes written.
    """
    if foldername and not os.path.isdir(foldername):
        os.mkdir(foldername)
    page, baseurl = openURL(url, postdata)
    # html5lib copes with real-world (broken) HTML; build a DOM tree
    builder = html5lib.treebuilders.getTreeBuilder("dom")
    dom = html5lib.HTMLParser(tree=builder).parse(page)
    page.close()
    dom.normalize()
    count = 0
    totalbytes = 0
    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if not src:
            continue
        # src may be relative; resolve against the (possibly redirected) page URL
        src = absolutizeURL(src, baseurl)
        print >> sys.stderr, src
        nbytes = downloadURL(src, foldername)
        if nbytes:
            count += 1
            totalbytes += nbytes
    return count, totalbytes

if __name__ == "__main__":
    # Demo: download every <img> image from automatist.org into ./test
    spiderImages("http://automatist.org", None, "test")
    # Example of spidering a POST-driven page (postdata is a urlencoded string):
    # spiderImages("http://www.ah.nl/previouslybought/PreviouslyBought.do", "cardNumber=2620480991698", "ah")


Attachments