Greg the Craig Spider

Simple Python spider script adapted from Aymeric's original.

  • Go to the first page of New York's Craigslist apartment listings (100 listings)
  • Gather all links on that page using CSSSelector('a[href]')(page) (see the sketch after this list)
  • Visit those links, grab all pictures
  • Rename each picture with the price of the apartment (e.g. $2899)
  • Quickly get an idea of how overpriced NYC apartments are!
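
To see just the link-gathering step on its own, here is a minimal sketch using the same libraries as the full script below (the user-agent string is only a placeholder):

import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

url = 'http://newyork.craigslist.org/aap/'
request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
f = urllib2.urlopen(request)

# html5lib builds an lxml tree, which a CSSSelector can query directly
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f.read())

# resolve each href against the page's final URL and print it
for link in CSSSelector('a[href]')(page):
	print urlparse.urljoin(f.geturl(), link.attrib['href'])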

Soft

import os
import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

# pictures are written into imgs/, so make sure the folder exists
if not os.path.isdir('imgs'):
	os.makedirs('imgs')
 
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://newyork.craigslist.org/aap/']
c = 0

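# two passes: the first fetches the index page, the second fetches every listing it links to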
while c<2:
	for target in targets:
		print '*** '+target+' ***'
		try:
			request = urllib2.Request(target, None, {'User-Agent': useragent})
			f=urllib2.urlopen(request)
			# read the whole page once: it is searched for the price and parsed for links and images
			t = f.read()
			try:
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
				page = parser.parse(t)

				links = []

				# any links for me? keep absolute .html links that point away from this page
				for link in CSSSelector('a[href]')(page):
					href = urlparse.urljoin(f.geturl(), link.attrib['href'])
					if href.split(':')[0] == 'http' and href != target and href.split('.')[-1] == "html":
						links.append(href)

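				# the links gathered here become the targets of the next pass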
				if links:
					targets = links
					if c!=0:
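						# the price is the first token inside the listing's <h2>, or 'free' if there is none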
						if t.find('<h2>') != -1:
							title = t.split('<h2>')[1]
							title = title.split(' ')[0]
						else:
							title = 'free'
						n=0
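						# fetch every picture on the listing and save it as <price>_<n>.<ext>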
						for elt in CSSSelector('img[src]')(page):
							href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
							request = urllib2.Request(href, None, {'User-Agent': useragent})
							remotefile = urllib2.urlopen(request)
							print 'downloading ' + href
							ext = href.split('.')[-1]
							localfile = open('imgs/'+title+'_'+str(n)+'.'+ext, "wb")
							#localfile = open('imgs/'+href.split('/')[-1], "wb")
							localfile.write(remotefile.read())
							localfile.close()
							n+=1


			except (IOError, ValueError, AssertionError):
				print "Ooops"
			c+=1
		except urllib2.HTTPError, err:
			print "HTTP Error:",err.code , target
			print "trying other URL"
		except urllib2.URLError, err:
			print "URL Error:",err.reason , target
			print "trying other URL"