Greg the Craig Spider
A simple Python spider script, adapted from Aymeric's original.
- Go to the first page of New York's Craigslist apartment listings (100 listings)
- Gather all the links on that page using CSSSelector('a[href]')(page) (a short example follows this list)
- Visit those links and grab all the pictures
- Rename each picture with the price of the apartment (e.g. $2899)
- Quickly get an idea of how overpriced NYC apartments are!
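As a quick illustration of the selector call mentioned above, here is a minimal, self-contained sketch. The HTML snippet and its URL are made up, but the parser setup is the same one the script below uses.

from lxml.cssselect import CSSSelector
import html5lib

# made-up snippet standing in for a fetched Craigslist page
snippet = '<html><body><a href="http://example.org/listing.html">ad</a><a name="top">no href</a></body></html>'

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
tree = parser.parse(snippet)

# CSSSelector compiles 'a[href]' to XPath once; calling the result on a
# parsed tree returns the matching elements (the second <a> has no href)
for link in CSSSelector('a[href]')(tree):
    print link.attrib['href']   # -> http://example.org/listing.html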
Soft
import os
import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://newyork.craigslist.org/aap/']
c = 0

# downloaded pictures land in imgs/, so make sure it exists
if not os.path.isdir('imgs'):
    os.makedirs('imgs')

# pass 0 (c == 0) harvests the listing links from the index page;
# pass 1 visits each listing and grabs its pictures
while c < 2:
    for target in targets:
        print '*** ' + target + ' ***'
        try:
            request = urllib2.Request(target, None, {'User-Agent': useragent})
            f = urllib2.urlopen(request)
            t = f.read()  # the whole page: parsed below, and searched for the price
            try:
                parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
                page = parser.parse(t)
                links = []  # any links for me?
                for link in CSSSelector('a[href]')(page):
                    href = urlparse.urljoin(f.geturl(), link.attrib['href'])
                    # keep only absolute .html links that aren't the current page
                    if href.split(':')[0] == 'http' and href != target and href.split('.')[-1] == "html":
                        links.append(href)
                if links:
                    targets = links  # the next pass of the while loop walks the listings
                if c != 0:
                    # the listing's <h2> starts with the price, e.g. "$2899 / 2br ..."
                    if t.find('<h2>') != -1:
                        title = t.split('<h2>')[1]
                        title = title.split(' ')[0]
                    else:
                        title = 'free'
                    n = 0
                    for elt in CSSSelector('img[src]')(page):
                        href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
                        request = urllib2.Request(href, None, {'User-Agent': useragent})
                        remotefile = urllib2.urlopen(request)
                        print 'downloading ' + href
                        ext = href.split('.')[-1]
                        # name each picture after the apartment's price, e.g. imgs/$2899_0.jpg
                        localfile = open('imgs/' + title + '_' + str(n) + '.' + ext, "wb")
                        localfile.write(remotefile.read())
                        localfile.close()
                        n += 1
            except IOError:
                print "Ooops"
            except ValueError:
                print "Ooops"
            except AssertionError:
                print "Ooops"
            c += 1
        except urllib2.HTTPError, err:
            print "HTTP Error:", err.code, target
            print "trying other URL"
        except urllib2.URLError, err:
            print "URL Error:", err.reason, target
            print "trying other URL"