WebSpiders

From XPUB & Lens-Based wiki
Revision as of 21:29, 23 September 2010 by Migratebot (talk | contribs) (Created page with "== Custom web spider in Python == Code attached below! Demonstrates Python's built-in urllib2, and urlparse modules. Uses the external python module [http://code.google.com...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Custom web spider in Python

Code attached below!

Demonstrates Python's built-in urllib2 and urlparse modules. Uses the external Python module html5lib for robust HTML parsing (see link for installation instructions).

A "spider" typically follows the links it finds on a page, then repeats this process (as many times as desired). In this simple spider, the search is only one-level deep, for images (in HTML img tags), on an HTML page.

import urllib2, urlparse, os, sys
import html5lib
 
def absolutizeURL (href, base):
    """
    Return href as an absolute URL, resolving it against base when needed.

    href: URL as found in the page (may be relative, may carry stray whitespace)
    base: absolute URL of the page the href was found on

    An href that already starts with http:// or https:// (case-insensitive)
    is returned unchanged apart from having surrounding whitespace removed;
    anything else is joined onto base with urlparse.urljoin.
    """
    # Attribute values are often padded with spaces or newlines; without
    # stripping, " http://..." would be mistaken for a relative path.
    href = href.strip()
    if href.lower().startswith(("http://", "https://")):
        return href
    return urlparse.urljoin(base, href)

def openURL (url, data):
    """
    Open url with a browser-like User-Agent and return (response, final_url).

    data is handed straight to urllib2.urlopen as its second argument
    (pass None when there is nothing to send).
    final_url is taken from the response's geturl() and may differ from url
    when a redirect was followed.
    The response is returned open; this function does not close it.
    """
    browser_id = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
    req = urllib2.Request(url)
    req.add_header("User-Agent", browser_id)
    response = urllib2.urlopen(req, data)
    return (response, response.geturl())

def downloadURL (url, foldername=""):
    """
    Download the resource at url to a local file and return its size in bytes.

    url: address of the resource to fetch
    foldername: directory to save into (default: current directory);
                it is not created here

    The local filename is the last component of the URL's path. When the path
    has no last component (for example it ends in "/"), "index.html" is used.
    An existing file of the same name is overwritten.
    Errors raised by urllib2.urlopen or by opening/writing the file are not
    caught; both handles are closed before such an error propagates.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        # make a filename based on the URL, inside foldername (if given)
        urlpath = urlparse.urlparse(url)[2]
        filename = os.path.split(urlpath)[1]
        if not filename:
            # No usable name in the URL path; without a fallback open() would
            # be handed the folder itself (or an empty path) and fail.
            filename = "index.html"
        out = open(os.path.join(foldername, filename), "wb")
        try:
            total = 0
            while True:
                data = pagefile.read(1024)
                if not data:
                    break
                total += len(data)
                out.write(data)
        finally:
            out.close()
    finally:
        pagefile.close()
    return total


def spiderImages (url, postdata=None, foldername=""):
    """
    Fetch the HTML page at url and download every image referenced by an
    <img> tag on it (images referenced only from CSS are not picked up).

    postdata is passed on to openURL as the data to send with the page request.
    Images are saved inside foldername when one is given; the folder is
    created if it does not exist yet.

    Returns (count, totalbytes): the number of images for which a non-zero
    number of bytes was downloaded, and the sum of those byte counts.
    """
    needs_folder = foldername and not os.path.isdir(foldername)
    if needs_folder:
        os.mkdir(foldername)

    page, pageurl = openURL(url, postdata)
    builder = html5lib.treebuilders.getTreeBuilder("dom")
    dom = html5lib.HTMLParser(tree=builder).parse(page)
    page.close()
    dom.normalize()

    saved = 0
    totalbytes = 0
    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if not src:
            continue
        # resolve against the URL actually served (after any redirect)
        src = absolutizeURL(src, pageurl)
        print >> sys.stderr, src
        size = downloadURL(src, foldername)
        if not size:
            continue
        totalbytes += size
        saved += 1
    return saved, totalbytes

if __name__ == "__main__":
    # Demo run: save every <img> found on the page into the "test" folder.
    spiderImages(url="http://automatist.org", postdata=None, foldername="test")
    # Variant that passes data along with the page request:
    # spiderImages("http://www.ah.nl/previouslybought/PreviouslyBought.do", "cardNumber=2620480991698", "ah")


Attachments