Simple Web Spider in Python: Difference between revisions

From XPUB & Lens-Based wiki
Line 99: Line 99:
from lxml.cssselect import CSSSelector
from lxml.cssselect import CSSSelector


useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']
targets = ['http://www.volkskrant.nl/']


Line 107: Line 108:
     # try to open URL ...
     # try to open URL ...
     try:
     try:
         request = urllib2.Request(target)
         request = urllib2.Request(target, None, {'User-Agent': useragent})
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
         f=urllib2.urlopen(request)
         f=urllib2.urlopen(request)


Line 120: Line 120:
                 for link in CSSSelector('a[href]')(page):
                 for link in CSSSelector('a[href]')(page):
                     href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                     href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                     if href.split(':')[0] == 'http': # No js links
                     if href.split(':')[0] == 'http' and href != target: # No useless links
                         links.append(href)
                         links.append(href)
             if links: # Anything left?
             if links: # Anything left?
Line 126: Line 126:
                 for elt in CSSSelector('img[src]')(page):
                 for elt in CSSSelector('img[src]')(page):
                     href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                     href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                     remotefile = urllib2.urlopen(href)
                    request = urllib2.Request(href, None, {'User-Agent': useragent})
                     remotefile = urllib2.urlopen(request)
                     print 'downloading ' + href
                     print 'downloading ' + href
                     localfile = open('dump/'+href.split('/')[-1], "wb")
                     localfile = open('dump/'+href.split('/')[-1], "wb")
Line 132: Line 133:
                     localfile.close()
                     localfile.close()


         except ValueError, err:
        except IOError:
             print "Value Error:", err, target
            print "Ooops"
 
         except ValueError:
             print "Ooops"
 
        except AssertionError:
            print "Ooops"


     # ... catch HTTP and URL errors
     # ... catch HTTP and URL errors

Revision as of 15:12, 14 January 2011

Opening a network connection with urllib2

import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f=urllib2.urlopen(request)

print f.geturl()
print f.info()
print f.read()

Some sites require that you set the "User-Agent" header.

import urllib2

# Some servers refuse urllib2's default User-Agent, so present
# ourselves as an ordinary desktop Firefox instead.
ua = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
req = urllib2.Request("http://www.volkskrant.nl/")
req.add_header("User-Agent", ua)
f = urllib2.urlopen(req)

Get the URL of all the links in a page and jump to a random page

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)

    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)

    targets = []

    for links in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), links.attrib['href'])
        if href.split(':')[0] == 'http':
            targets.append(href)

Get the URL of all the links in a page and jump to a random page AND BE SMART

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'

    # try to open URL ...
    try:
        request = urllib2.Request(target)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
        f=urllib2.urlopen(request)

        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
        except ValueError, err:
            print "Value Error:", err, target

        links = []
        if CSSSelector('a[href]')(page): # Any links for me?
            for link in CSSSelector('a[href]')(page):
                href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                if href.split(':')[0] == 'http': # No js links
                    links.append(href)
        if links: # Anything left?
            targets = links


    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:",err.code , target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"

Same as above and grab all the pictures found on each page

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'

    # try to open URL ...
    try:
        request = urllib2.Request(target, None, {'User-Agent': useragent})
        f=urllib2.urlopen(request)

        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)

            links = []
            if CSSSelector('a[href]')(page): # Any links for me?
                for link in CSSSelector('a[href]')(page):
                    href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                    if href.split(':')[0] == 'http' and href != target: # No useless links
                        links.append(href)
            if links: # Anything left?
                targets = links
                for elt in CSSSelector('img[src]')(page):
                    href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                    request = urllib2.Request(href, None, {'User-Agent': useragent})
                    remotefile = urllib2.urlopen(request)
                    print 'downloading ' + href
                    localfile = open('dump/'+href.split('/')[-1], "wb")
                    localfile.write(remotefile.read())
                    localfile.close()

        except IOError:
            print "Ooops"

        except ValueError:
            print "Ooops"

        except AssertionError:
            print "Ooops"

    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:",err.code , target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"