Simple Web Spider in Python
Opening a network connection with urllib2
import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f = urllib2.urlopen(request)
print f.geturl()  # the URL actually opened (after any redirects)
print f.info()    # the response headers
print f.read()    # the page itself
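f.info() returns the response headers as an httplib.HTTPMessage, so you can inspect them before doing anything with the body. A small sketch, checking the Content-Type first:

import urllib2
request = urllib2.Request("http://www.volkskrant.nl/")
f = urllib2.urlopen(request)
# gettype() gives the Content-Type without its charset parameter
if f.info().gettype() == 'text/html':
    print 'looks like HTML, safe to parse'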
Some sites require that you set the "User-Agent" header.
import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
# pretend to be Firefox on Ubuntu
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f = urllib2.urlopen(request)
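Equivalently, the headers can be passed straight to the Request constructor as a dictionary; this is the form the last example on this page uses:

import urllib2
useragent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
request = urllib2.Request("http://www.volkskrant.nl/", None, {'User-Agent': useragent})
f = urllib2.urlopen(request)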
Get the URLs of all the links on a page and jump to a random page
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']
while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urllib2.urlopen(request)
    # parse the page into an lxml tree so we can run CSS selectors on it
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)
    targets = []
    for link in CSSSelector('a[href]')(page):
        # make relative hrefs absolute, based on the page's final URL
        href = urlparse.urljoin(f.geturl(), link.attrib['href'])
        if href.split(':')[0] == 'http':  # keep only plain http links
            targets.append(href)
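urlparse.urljoin is what turns relative hrefs into absolute URLs, resolving them against the address the page was actually served from (f.geturl(), which reflects any redirects). A quick illustration:

import urlparse
# a relative link on a section page climbs the path
print urlparse.urljoin('http://www.volkskrant.nl/buitenland/', '../sport/index.html')
# -> http://www.volkskrant.nl/sport/index.html
# an already absolute link passes through unchanged
print urlparse.urljoin('http://www.volkskrant.nl/', 'http://example.com/a.html')
# -> http://example.com/a.html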
Get the URLs of all the links on a page and jump to a random page AND BE SMART
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']
while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    # try to open URL ...
    try:
        request = urllib2.Request(target)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
        f = urllib2.urlopen(request)
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
        except ValueError, err:
            print "Value Error:", err, target
            continue  # nothing to parse here, pick another target
        links = []
        for link in CSSSelector('a[href]')(page):  # Any links for me?
            href = urlparse.urljoin(f.geturl(), link.attrib['href'])
            if href.split(':')[0] == 'http':  # No js links
                links.append(href)
        if links:  # Anything left?
            targets = links
    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:", err.code, target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:", err.reason, target
        print "trying other URL"
Same as above and grab all the pictures found on each page
import os
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']

# the downloads go into dump/, so make sure it exists
if not os.path.isdir('dump'):
    os.mkdir('dump')

while True:
    target = random.choice(targets)
    print '*** '+target+' ***'
    # try to open URL ...
    try:
        request = urllib2.Request(target, None, {'User-Agent': useragent})
        f = urllib2.urlopen(request)
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
            links = []
            for link in CSSSelector('a[href]')(page):  # Any links for me?
                href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                if href.split(':')[0] == 'http' and href != target:  # No useless links
                    links.append(href)
            if links:  # Anything left?
                targets = links
            for elt in CSSSelector('img[src]')(page):
                href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                request = urllib2.Request(href, None, {'User-Agent': useragent})
                remotefile = urllib2.urlopen(request)
                print 'downloading ' + href
                localfile = open('dump/'+href.split('/')[-1], "wb")
                localfile.write(remotefile.read())
                localfile.close()
        except (IOError, ValueError, AssertionError), err:
            print "Ooops:", err
    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:", err.code, target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:", err.reason, target
        print "trying other URL"