Simple Web Spider in Python: Difference between revisions
No edit summary |
|||
Line 44: | Line 44: | ||
localfile.close() | localfile.close() | ||
</source> | |||
== Get the URL of all the links in a page and jump to a random page == | |||
<source lang="python"> | |||
import random | |||
import urllib2, urlparse, html5lib, lxml | |||
from lxml.cssselect import CSSSelector | |||
targets = ['http://www.volkskrant.nl/'] | |||
while True: | |||
target = random.choice(targets) | |||
print '*** '+target+' ***' | |||
request = urllib2.Request(target) | |||
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5") | |||
f=urllib2.urlopen(request) | |||
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False) | |||
page = parser.parse(f) | |||
targets = [] | |||
for links in CSSSelector('a[href]')(page): | |||
href = urlparse.urljoin(f.geturl(), links.attrib['href']) | |||
if href.split(':')[0] == 'http': | |||
targets.append(href) | |||
</source> | </source> | ||
[[Category:Cookbook]] | [[Category:Cookbook]] |
Revision as of 21:12, 12 January 2011
Opening a network connection with urllib2
import urllib2
request = urllib2.Request("http://www.volkskrant.nl/")
f=urllib2.urlopen(request)
print f.geturl()
print f.info()
print f.read()
Some sites require that you set the "User-Agent" header.
# Some sites reject urllib2's default user agent; present a browser UA instead.
import urllib2

req = urllib2.Request("http://www.volkskrant.nl/")
req.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f = urllib2.urlopen(req)
Get the URL of all images on a page and download them in a folder
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector
request = urllib2.Request("http://www.volkskrant.nl/")
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f=urllib2.urlopen(request)
# f.geturl(), f.info()
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f)
for elt in CSSSelector('img[src]')(page):
href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
print 'downloading ' + href
localfile = open('dump/'+href.split('/')[-1], "wb")
localfile.write(remotefile.read())
localfile.close()
Get the URL of all the links in a page and jump to a random page
import random
import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector
targets = ['http://www.volkskrant.nl/']
while True:
target = random.choice(targets)
print '*** '+target+' ***'
request = urllib2.Request(target)
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")
f=urllib2.urlopen(request)
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f)
targets = []
for links in CSSSelector('a[href]')(page):
href = urlparse.urljoin(f.geturl(), links.attrib['href'])
if href.split(':')[0] == 'http':
targets.append(href)