Greg the Craig Spider

Simple Python spider script adapted from Aymeric's original.

  • Go to the first page of New York's Craigslist apartment listings (100 listings)
  • Gather all links on that page using CSSSelector('a[href]')(page) (see the sketch after this list)
  • Visit those links, grab all pictures
  • Rename each picture with the price of the apartment (e.g. $2899)
  • Quickly get an idea of how overpriced NYC apartments are!
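
To see just the link-gathering step on its own, here is a minimal sketch using the same libraries as the full script below (the user-agent string is only a placeholder):

import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

url = 'http://newyork.craigslist.org/aap/'
request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
f = urllib2.urlopen(request)

# html5lib builds an lxml tree, which a CSSSelector can query directly
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f.read())

# resolve each href against the page's final URL and print it
for link in CSSSelector('a[href]')(page):
	print urlparse.urljoin(f.geturl(), link.attrib['href'])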

Soft

import os
import urllib2, urlparse, html5lib
from lxml.cssselect import CSSSelector

# pictures are written into imgs/, so make sure the folder exists
if not os.path.isdir('imgs'):
	os.makedirs('imgs')
 
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
targets = ['http://newyork.craigslist.org/aap/']
c = 0

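# two passes: the first fetches the index page, the second fetches every listing it links to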
while c<2:
	for target in targets:
		print '*** '+target+' ***'
		try:
			request = urllib2.Request(target, None, {'User-Agent': useragent})
			f=urllib2.urlopen(request)
			# read the whole page once: it is searched for the price and parsed for links and images
			t = f.read()
			try:
				parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
				page = parser.parse(t)

				links = []

				# any links for me? keep absolute .html links that point away from this page
				for link in CSSSelector('a[href]')(page):
					href = urlparse.urljoin(f.geturl(), link.attrib['href'])
					if href.split(':')[0] == 'http' and href != target and href.split('.')[-1] == "html":
						links.append(href)

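				# the links gathered here become the targets of the next pass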
				if links:
					targets = links
					if c!=0:
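						# the price is the first token inside the listing's <h2>, or 'free' if there is none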
						if t.find('<h2>') != -1:
							title = t.split('<h2>')[1]
							title = title.split(' ')[0]
						else:
							title = 'free'
						n=0
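						# fetch every picture on the listing and save it as <price>_<n>.<ext>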
						for elt in CSSSelector('img[src]')(page):
							href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
							request = urllib2.Request(href, None, {'User-Agent': useragent})
							remotefile = urllib2.urlopen(request)
							print 'downloading ' + href
							ext = href.split('.')[-1]
							localfile = open('imgs/'+title+'_'+str(n)+'.'+ext, "wb")
							#localfile = open('imgs/'+href.split('/')[-1], "wb")
							localfile.write(remotefile.read())
							localfile.close()
							n+=1


			except (IOError, ValueError, AssertionError):
				print "Ooops"
			c+=1
		except urllib2.HTTPError, err:
			print "HTTP Error:",err.code , target
			print "trying other URL"
		except urllib2.URLError, err:
			print "URL Error:",err.reason , target
			print "trying other URL"