Building a web crawler with Django
An example of using Django to crawl Flickr.
(Compact instructions: create a Django project (called crawl) and an application (called flickr). Set up the database (sqlite3) and switch on the standard admin options.)
Models
Models are defined for a Photo, a Comment, and Exif data.
models.py (inside flickr app)
from django.db import models
class Photo(models.Model):
    """A Flickr photo.

    Besides the descriptive fields (title, description, page URL) the model
    stores the ``farm``, ``server`` and ``secret`` values that, together with
    ``flickrid``, are needed to build the URL of the image file itself
    (see ``image_url``).
    """
    flickrid = models.CharField(max_length=255, blank=True)
    title = models.CharField(max_length=255, blank=True)
    description = models.TextField(blank=True)
    page_url = models.URLField(blank=True, verify_exists=False)
    # the necessary info to construct the image URLs
    server = models.CharField(max_length=255, blank=True)
    farm = models.CharField(max_length=255, blank=True)
    secret = models.CharField(max_length=255, blank=True)

    def __unicode__(self):
        # Return real unicode: str() on a unicode value containing non-ASCII
        # characters raises UnicodeEncodeError under Python 2.
        return u"%s" % self.flickrid

    def image_url(self, code="t"):
        """Return the URL of the image file for this photo.

        code -- suffix inserted before ".jpg" in the file name; the default
                "t" is what ``admin_thumbnail`` uses (presumably Flickr's
                thumbnail size -- confirm against the Flickr URL docs).
        """
        return "http://farm%(farm)s.static.flickr.com/%(server)s/%(id)s_%(secret)s_%(code)s.jpg" % {
            'farm': self.farm,
            'server': self.server,
            'secret': self.secret,
            'id': self.flickrid,
            'code': code,
        }

    def admin_thumbnail(self):
        """Return an <img> tag showing the photo, for use as an admin column."""
        # Imported locally so the module's top-level imports stay untouched.
        from django.utils.html import escape
        # The URL is assembled from values stored in the database (originally
        # supplied by an external API), and allow_tags below tells the admin
        # NOT to escape this method's output -- so escape the attribute value
        # here to keep a stray quote or angle bracket from injecting markup.
        return '<img src="%s" />' % escape(self.image_url(code="t"))
    # tell the admin change list to render the returned HTML as-is
    admin_thumbnail.allow_tags = True
class Exif(models.Model):
    """A single EXIF entry (tag space, tag name and content) of a Photo."""
    # owning photo; reachable from the photo side as ``photo.exif``
    photo = models.ForeignKey(Photo, related_name="exif")
    tag = models.CharField(max_length=255)
    tagspace = models.CharField(max_length=255)
    content = models.CharField(max_length=255)

    def __unicode__(self):
        # Give the record a readable label, as Photo and Comment do; without
        # this Django falls back to the generic "Exif object".  Only local
        # fields are used so that rendering the label never triggers an
        # extra query for the related photo.
        return u"%s:%s" % (self.tagspace, self.tag)
class Comment(models.Model):
    """A comment attached to a Photo, with the commenter's id and name."""
    # owning photo; reachable from the photo side as ``photo.comments``
    photo = models.ForeignKey(Photo, related_name="comments")
    authorid = models.CharField(max_length=255)
    authorname = models.CharField(max_length=255)
    # the comment text itself; may be empty
    body = models.TextField(blank=True)

    def __unicode__(self):
        # label the comment by the Flickr id of the photo it belongs to
        photo_id = self.photo.flickrid
        return "Comment on %s" % photo_id
Admin
A custom admin makes the automatically generated admin views much more useful.
admin.py
from django.contrib import admin
from models import *
class PhotoAdmin(admin.ModelAdmin):
    """Admin change list for Photo: id, thumbnail and descriptive fields."""
    # "admin_thumbnail" is a method on Photo that returns an <img> tag
    list_display = (
        "flickrid",
        "admin_thumbnail",
        "title",
        "description",
        "page_url",
    )


admin.site.register(Photo, PhotoAdmin)
class CommentAdmin(admin.ModelAdmin):
    """Admin change list for Comment: the owning photo and the comment text."""
    list_display = (
        "photo",
        "body",
    )


admin.site.register(Comment, CommentAdmin)
class ExifAdmin(admin.ModelAdmin):
    """Admin change list for Exif with filtering and free-text search."""
    # columns shown in the change list
    list_display = (
        "photo",
        "tagspace",
        "tag",
        "content",
    )
    # sidebar filters
    list_filter = (
        "tagspace",
        "tag",
    )
    # fields covered by the admin search box
    search_fields = (
        "tag",
        "content",
        "tagspace",
    )


admin.site.register(Exif, ExifAdmin)
Crawler
Finally, the actual crawler script. It runs in such a way that database records only get created the first time a photo is encountered. In this way, a simple crawl can be performed by periodically running the script (say using a cron job to run every hour). Flickr returns the latest 100 images matching the request.
To run this script, it should be saved outside of the Django project (for instance, in the folder containing the "crawl" project), and the DJANGO_SETTINGS_MODULE variable should be set in the environment. So:
export DJANGO_SETTINGS_MODULE=crawl.settings
python crawler.py
crawler.py
from crawl.flickr.models import *
import urllib2, json
from apikey import apikey
# Flickr REST endpoint.  Flickr only accepts API calls over HTTPS, and TLS
# also keeps the API key out of clear text on the wire.
FLICKR_REST_URL = "https://api.flickr.com/services/rest/"


def _flickr_call(method, extra_query=""):
    """Call a Flickr REST method and return the decoded JSON response.

    method      -- Flickr API method name, e.g. "flickr.photos.getInfo"
    extra_query -- extra, already formatted query arguments; must start
                   with "&" (e.g. "&photo_id=123")
    """
    url = (FLICKR_REST_URL + "?method=" + method + extra_query +
           "&format=json&nojsoncallback=1&api_key=" + apikey)
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        # always release the HTTP connection, even if decoding fails
        response.close()


def _crawl_photo(photo, summary):
    """Fill in and save a newly created photo, its comments and its EXIF data.

    photo   -- the Photo row just created for this Flickr id
    summary -- the photo's entry from the flickr.photos.search result
    """
    photo_arg = "&photo_id=" + summary['id']
    # NOTE(review): under Python 2 with stdout piped (e.g. from cron) a
    # non-ASCII title may raise UnicodeEncodeError here -- consider running
    # with PYTHONIOENCODING=utf-8.
    print('Processing photo: "%s"' % summary['title'])

    # flickr.photos.getInfo (flickrid itself was already set on creation)
    info = _flickr_call("flickr.photos.getInfo", photo_arg)['photo']
    photo.title = info['title']['_content']
    photo.description = info['description']['_content']
    photo.page_url = info['urls']['url'][0]['_content']
    photo.farm = info['farm']
    photo.server = info['server']
    photo.secret = info['secret']

    # comments
    # One problem with this script is that comments are only ever checked
    # once (when the image is first seen), so new comments may be missing.
    numcomments = int(info['comments']['_content'])
    if numcomments:
        print(" reading comments (%d)..." % numcomments)
        comments = _flickr_call("flickr.photos.comments.getList", photo_arg)
        for c in comments['comments']['comment']:
            comment = Comment(photo=photo)
            comment.body = c['_content']
            comment.authorid = c['author']
            comment.authorname = c['authorname']
            comment.save()

    # EXIF
    exif = _flickr_call("flickr.photos.getExif", photo_arg)
    try:
        for e in exif['photo']['exif']:
            data = Exif(photo=photo)
            data.tag = e['tag']
            data.tagspace = e['tagspace']
            data.content = e['raw']['_content']
            data.save()
    except KeyError:
        # this happens when the image has no exif data
        print(" no exif data")

    # finally remember to commit the changes to the database
    photo.save()


search = _flickr_call("flickr.photos.search", "&tags=tomato")
for p in search['photos']['photo']:
    # CREATE/LOOKUP A DATABASE PHOTO OBJECT
    (photo, created) = Photo.objects.get_or_create(flickrid=p['id'])
    # SKIP AHEAD IF WE HAVE THIS ONE ALREADY
    if not created:
        continue
    try:
        _crawl_photo(photo, p)
    except BaseException:
        # get_or_create has already stored a blank row.  If it were left
        # behind, every later run would treat this photo as done and skip it,
        # so remove it (Django's default cascading delete also removes any
        # comments/EXIF saved so far) and let the error propagate; the next
        # run will then retry this photo.
        photo.delete()
        raise