Simple Web Spider in Python
A web spider that starts at a given URL and follows the links it finds, hopping from page to page.
== Opening a network connection with urllib2 ==
<source lang="python">
import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f = urllib2.urlopen(request)
print f.geturl()  # the URL that was actually retrieved, after redirects
print f.info()    # the HTTP response headers
print f.read()    # the body of the page
</source>
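The response object also carries the HTTP headers, so you can check what you actually received before reading the whole body. A minimal sketch (this Content-Type check is an extra precaution, not part of the original recipe):

<source lang="python">
import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
f = urllib2.urlopen(request)
# f.info() is a mimetools.Message; gettype() returns the media
# type without any charset parameter, e.g. 'text/html'
if f.info().gettype() == 'text/html':
    html = f.read()
else:
    print 'not an HTML page:', f.info().gettype()
</source>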
Some sites require that you set the "User-Agent" header.
<source lang="python">
import urllib2

request = urllib2.Request("http://www.volkskrant.nl/")
# pretend to be a desktop Firefox rather than Python-urllib
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f = urllib2.urlopen(request)
</source>
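The header can also be passed straight to the Request constructor as a dictionary, which is the form the last recipe below uses:

<source lang="python">
import urllib2

useragent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
# the second argument is the POST data (None means a plain GET),
# the third a dict of extra headers
request = urllib2.Request("http://www.volkskrant.nl/", None, {'User-Agent': useragent})
f = urllib2.urlopen(request)
</source>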
== Get the URL of all the links in a page and jump to a random page ==
<source lang="python">
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    # pick a random page from the links collected last time round
    target = random.choice(targets)
    print '*** ' + target + ' ***'
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
    f = urllib2.urlopen(request)
    # parse the page into an lxml tree
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)
    # collect the absolute URL of every link on the page;
    # note: if a page has no http links, targets ends up empty and
    # random.choice will fail -- the next recipe deals with that
    targets = []
    for link in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), link.attrib['href'])
        if href.split(':')[0] == 'http':
            targets.append(href)
</source>
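Because targets is rebuilt from whatever page the spider lands on, it happily bounces back to pages it has already seen. One possible refinement, sketched below and not part of the original recipe, keeps a set of visited URLs and only follows new ones:

<source lang="python">
import random
import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5"
visited = set()
targets = ['http://www.volkskrant.nl/']

while targets:  # stop when there is nothing new left to visit
    target = random.choice(targets)
    visited.add(target)
    print '*** ' + target + ' ***'
    request = urllib2.Request(target, None, {'User-Agent': useragent})
    f = urllib2.urlopen(request)
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    page = parser.parse(f)
    # keep only http links that have not been visited yet
    targets = []
    for link in CSSSelector('a[href]')(page):
        href = urlparse.urljoin(f.geturl(), link.attrib['href'])
        if href.split(':')[0] == 'http' and href not in visited:
            targets.append(href)
</source>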
== Get the URL of all the links in a page and jump to a random page AND BE SMART ==
<source lang="python">
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** ' + target + ' ***'
    # try to open URL ...
    try:
        request = urllib2.Request(target)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
        f = urllib2.urlopen(request)
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
        except ValueError, err:
            print "Value Error:", err, target
            continue  # nothing to parse, pick another target
        links = []
        for link in CSSSelector('a[href]')(page):  # Any links for me?
            href = urlparse.urljoin(f.geturl(), link.attrib['href'])
            if href.split(':')[0] == 'http':  # No js links
                links.append(href)
        if links:  # Anything left?
            targets = links
    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:", err.code, target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:", err.reason, target
        print "trying other URL"
</source>
== Same as above and grab all the pictures found on each page AND BE SMARTER ==
<source lang="python">
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://www.volkskrant.nl/']

while True:
    target = random.choice(targets)
    print '*** ' + target + ' ***'
    # try to open URL ...
    try:
        request = urllib2.Request(target, None, {'User-Agent': useragent})
        f = urllib2.urlopen(request)
        # Is it really something that I can parse? srsly?
        try:
            parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
            page = parser.parse(f)
            links = []
            for link in CSSSelector('a[href]')(page):  # Any links for me?
                href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                if href.split(':')[0] == 'http' and href != target:  # No useless links
                    links.append(href)
            if links:  # Anything left?
                targets = links
            # grab every image on the page into the dump/ directory
            # (the directory must exist before the script is run)
            for elt in CSSSelector('img[src]')(page):
                href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                request = urllib2.Request(href, None, {'User-Agent': useragent})
                remotefile = urllib2.urlopen(request)
                print 'downloading ' + href
                localfile = open('dump/' + href.split('/')[-1], "wb")
                localfile.write(remotefile.read())
                localfile.close()
        except (IOError, ValueError, AssertionError), err:
            print "Ooops:", err
    # ... catch HTTP and URL errors
    except urllib2.HTTPError, err:
        print "HTTP Error:", err.code, target
        print "trying other URL"
    except urllib2.URLError, err:
        print "URL Error:", err.reason, target
        print "trying other URL"
</source>