Simple Web Spider in Python: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
 
(6 intermediate revisions by one other user not shown)
Line 1: Line 1:
A web spider that starts at a given URL, and follows links
== Opening a network connection with urllib2 ==
== Opening a network connection with urllib2 ==


Line 22: Line 24:
</source>
</source>


== Get the URL of all images on a page and download them in a folder ==
== Get the URL of all the links in a page and jump to a random page ==


<source lang="python">
<source lang="python">
import random
import urllib2, urlparse, html5lib, lxml
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector
from lxml.cssselect import CSSSelector


request = urllib2.Request("http://www.volkskrant.nl/")
targets = ['http://www.volkskrant.nl/']
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f=urllib2.urlopen(request)


# f.geturl(), f.info()
while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)


parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f)
    page = parser.parse(f)


for elt in CSSSelector('img[src]')(page):
     targets = []
     href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
    print 'downloading ' + href
    localfile = open('dump/'+href.split('/')[-1], "wb")
    localfile.write(remotefile.read())
    localfile.close()


    for links in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), links.attrib['href'])
        if href.split(':')[0] == 'http':
            targets.append(href)
</source>
</source>


== Get the URL of all the links in a page and jump to a random page ==
== Get the URL of all the links in a page and jump to a random page AND BE SMART ==
<source lang="python">
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector
 
targets = ['http://www.volkskrant.nl/']
 
while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
 
    # try to open URL ...
    try:
        request = urllib2.Request(target)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
        f=urllib2.urlopen(request)
 
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
        except ValueError, err:
            print "Value Error:", err, target
 
        links = []
        if CSSSelector('a[href]')(page): # Any links for me?
            for link in CSSSelector('a[href]')(page):
                href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                if href.split(':')[0] == 'http': # No js links
                    links.append(href)
        if links: # Anything left?
            targets = links
 
 
    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:",err.code , target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"
</source>


== Same as above and grab all the pictures found on each page AND BE SMARTER==
<source lang="python">
<source lang="python">
import random
import random
Line 53: Line 101:
from lxml.cssselect import CSSSelector
from lxml.cssselect import CSSSelector


useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']
targets = ['http://www.volkskrant.nl/']


Line 58: Line 107:
     target = random.choice(targets)
     target = random.choice(targets)
     print '*** '+target+' ***'
     print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)


     parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
     # try to open URL ...
    page = parser.parse(f)
    try:
        request = urllib2.Request(target, None, {'User-Agent': useragent})
        f=urllib2.urlopen(request)
 
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
 
            links = []
            if CSSSelector('a[href]')(page): # Any links for me?
                for link in CSSSelector('a[href]')(page):
                    href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                    if href.split(':')[0] == 'http' and href != target: # No useless links
                        links.append(href)
            if links: # Anything left?
                targets = links
                for elt in CSSSelector('img[src]')(page):
                    href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                    request = urllib2.Request(href, None, {'User-Agent': useragent})
                    remotefile = urllib2.urlopen(request)
                    print 'downloading ' + href
                    localfile = open('dump/'+href.split('/')[-1], "wb")
                    localfile.write(remotefile.read())
                    localfile.close()
 
        except IOError:
            print "Ooops"
 
        except ValueError:
            print "Ooops"


    targets = []
        except AssertionError:
            print "Ooops"


     for links in CSSSelector('a[href]')(page):
     # ... catch HTTP and URL errors
         href = urlparse.urljoin(f.geturl(), links.attrib['href'])
    except urllib2.HTTPError, err:
         if href.split(':')[0] == 'http':
         print "HTTP Error:",err.code , target
            targets.append(href)
         print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"
</source>
</source>


[[Category:Cookbook]]
[[Category:Cookbook]]

Latest revision as of 10:51, 2 December 2013

A web spider that starts at a given URL, and follows links

Opening a network connection with urllib2

import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f=urllib2.urlopen(request)

print f.geturl()
print f.info()
print f.read()

Some sites require that you set the "User-Agent" header.

import urllib2

# Extra request headers can be handed to Request() as a dictionary
# (third argument; the second one is the optional POST data).
headers = {"User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"}
request = urllib2.Request("http://www.volkskrant.nl/", None, headers)
f = urllib2.urlopen(request)

Get the URL of all the links in a page and jump to a random page

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f=urllib2.urlopen(request)

    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)

    targets = []

    for links in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), links.attrib['href'])
        if href.split(':')[0] == 'http':
            targets.append(href)

Get the URL of all the links in a page and jump to a random page AND BE SMART

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'

    # try to open URL ...
    try:
        request = urllib2.Request(target)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
        f=urllib2.urlopen(request)

        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
        except ValueError, err:
            print "Value Error:", err, target

        links = []
        if CSSSelector('a[href]')(page): # Any links for me?
            for link in CSSSelector('a[href]')(page):
                href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                if href.split(':')[0] == 'http': # No js links
                    links.append(href)
        if links: # Anything left?
            targets = links


    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:",err.code , target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"

Same as above and grab all the pictures found on each page AND BE SMARTER

import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'

    # try to open URL ...
    try:
        request = urllib2.Request(target, None, {'User-Agent': useragent})
        f=urllib2.urlopen(request)

        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)

            links = []
            if CSSSelector('a[href]')(page): # Any links for me?
                for link in CSSSelector('a[href]')(page):
                    href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                    if href.split(':')[0] == 'http' and href != target: # No useless links
                        links.append(href)
            if links: # Anything left?
                targets = links
                for elt in CSSSelector('img[src]')(page):
                    href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                    request = urllib2.Request(href, None, {'User-Agent': useragent})
                    remotefile = urllib2.urlopen(request)
                    print 'downloading ' + href
                    localfile = open('dump/'+href.split('/')[-1], "wb")
                    localfile.write(remotefile.read())
                    localfile.close()

        except IOError:
            print "Ooops"

        except ValueError:
            print "Ooops"

        except AssertionError:
            print "Ooops"

    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:",err.code , target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:",err.reason , target
        print "trying other URL"