WebSpiders
Custom web spider in Python
Code attached below!
Demonstrates Python's built-in urllib2 and urlparse modules. Uses the external Python module html5lib for robust HTML parsing (see link for installation instructions).
A "spider" typically follows the links it finds on a page, then repeats this process on each linked page (as many times as desired). This simple spider searches only one level deep: it downloads the images (those in HTML img tags) found on a single HTML page.
import urllib2, urlparse, os, sys
import html5lib
def absolutizeURL(href, base):
    """
    Return href as an absolute URL.

    An href that already starts with "http://" (compared case-insensitively)
    is handed back untouched; anything else is resolved against base with
    urlparse.urljoin.
    """
    already_absolute = href.lower().startswith("http://")
    return href if already_absolute else urlparse.urljoin(base, href)
def openURL(url, data):
    """
    Open url with a browser-like User-Agent header.

    data is passed straight to urllib2.urlopen as its second argument.

    Returns a (response, final_url) tuple: the open file-like response and
    the address reported by its geturl(), which may differ from url when
    the request was redirected.
    """
    agent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
    req = urllib2.Request(url)
    req.add_header("User-Agent", agent)
    response = urllib2.urlopen(req, data)
    return (response, response.geturl())
def downloadURL(url, foldername=""):
    """
    Download the resource at url into a local file and return its size.

    The local filename is the last component of the URL's path, placed
    inside foldername when one is given (the folder must already exist).
    An existing file with the same name is overwritten.

    url: address to fetch.
    foldername: destination directory ("" means the current directory).

    Returns the number of bytes written to the local file.
    Raises whatever urllib2.urlopen raises when the request fails (e.g.
    urllib2.URLError), and IOError when the local file cannot be written.
    """
    request = urllib2.Request(url)
    # Send a browser-like User-Agent header with the request.
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        # make a filename based on the URL, inside foldername (if given)
        urlpath = urlparse.urlparse(url)[2]
        filename = os.path.basename(urlpath)
        if not filename:
            # The URL path is empty or ends in "/": there is no name to
            # reuse, and open() would otherwise be handed the folder itself.
            filename = "download"
        filename = os.path.join(foldername, filename)

        out = open(filename, "wb")
        try:
            total = 0
            while True:
                data = pagefile.read(1024)
                if not data:
                    break
                # Write each chunk exactly once so the file on disk matches
                # the byte count that is returned.
                out.write(data)
                total += len(data)
        finally:
            out.close()
    finally:
        # Release the connection even if reading or writing failed.
        pagefile.close()
    return total
def spiderImages(url, postdata=None, foldername=""):
    """
    Download every image referenced by an <img> tag on the HTML page at url.

    Images that are only referenced from CSS are not picked up.
    postdata is forwarded to openURL together with url. When foldername is
    given the files are saved inside it, and the folder is created first if
    it does not exist yet.

    Returns (count, totalbytes): how many downloads reported a non-zero
    size, and the sum of those sizes.
    """
    needs_folder = foldername and not os.path.isdir(foldername)
    if needs_folder:
        os.mkdir(foldername)

    # final_url is the address after any redirect; relative image paths are
    # resolved against it rather than against the requested url.
    page, final_url = openURL(url, postdata)
    builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.HTMLParser(tree=builder).parse(page)
    page.close()
    document.normalize()

    count = 0
    totalbytes = 0
    for img in document.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if not src:
            # <img> without a usable src attribute: nothing to fetch
            continue
        src = absolutizeURL(src, final_url)
        print >> sys.stderr, src
        size = downloadURL(src, foldername)
        if size:
            totalbytes += size
            count += 1
    return count, totalbytes
if __name__ == "__main__":
    # Demo run: save the images found on one page into the "test" folder.
    spiderImages("http://automatist.org", postdata=None, foldername="test")
    # Example of a request that sends POST data along with the URL:
    # spiderImages("http://www.ah.nl/previouslybought/PreviouslyBought.do", "cardNumber=2620480991698", "ah")