Simple Web Spider in Python

From XPUB & Lens-Based wiki
Revision as of 21:12, 12 January 2011 by Aymeric Mansoux (talk | contribs)

Opening a network connection with urllib2

import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f=urllib2.urlopen(request)

print f.geturl()
print f.info()
print f.read()

Some sites require that you set the "User-Agent" header.

import urllib2

# Present the request as coming from a regular browser by supplying the
# User-Agent header when the request is created.
browser_headers = {
    "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5",
}
request = urllib2.Request("http://www.volkskrant.nl/", headers=browser_headers)
f = urllib2.urlopen(request)

Get the URLs of all images on a page and download them into a folder

import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

request = urllib2.Request("http://www.volkskrant.nl/")
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f=urllib2.urlopen(request)

# f.geturl(), f.info()

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f)

for elt in CSSSelector('img[src]')(page):
    href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
    print 'downloading ' + href
    localfile = open('dump/'+href.split('/')[-1], "wb")
    localfile.write(remotefile.read())
    localfile.close()

Get the URLs of all the links on a page and jump to a random one

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)

    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)

    targets = []

    for links in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), links.attrib['href'])
        if href.split(':')[0] == 'http':
            targets.append(href)