Building a web crawler with Django
An example of using Django to crawl Flickr.
(Compact instructions: create a Django project (called crawl) and an application (called flickr). Set up the database (sqlite3) and switch on the standard admin options.)
Models
Models are defined for a Photo, a Comment, and Exif data.
models.py (inside flickr app)
from django.db import models
class Photo(models.Model):
    """A Flickr photo, identified by its Flickr id.

    Besides the descriptive fields, the record keeps the farm, server and
    secret values that are needed to construct the URLs of the image files.
    """

    flickrid = models.CharField(max_length=255, blank=True)
    title = models.CharField(max_length=255, blank=True)
    description = models.TextField(blank=True)
    page_url = models.URLField(blank=True, verify_exists=False)
    # the necessary info to construct the image URLs
    server = models.CharField(max_length=255, blank=True)
    farm = models.CharField(max_length=255, blank=True)
    secret = models.CharField(max_length=255, blank=True)

    def __unicode__(self):
        """Return the Flickr id as a unicode string."""
        # __unicode__ has to return unicode; str() would raise
        # UnicodeEncodeError as soon as the value is not pure ASCII.
        return u"%s" % self.flickrid

    def image_url(self, code="t"):
        """Return the URL of the image file for this photo.

        code -- suffix placed before ".jpg" in the file name; the default
                "t" is the one admin_thumbnail uses.
        """
        return "http://farm%(farm)s.static.flickr.com/%(server)s/%(id)s_%(secret)s_%(code)s.jpg" % {
            'farm': self.farm,
            'server': self.server,
            'secret': self.secret,
            'id': self.flickrid,
            'code': code,
        }

    def admin_thumbnail(self):
        """Return an <img> tag showing the thumbnail, for use in the admin."""
        from django.utils.html import escape
        # allow_tags switches off the admin's auto-escaping, and the URL is
        # assembled from values received from the Flickr API, so escape it
        # before placing it inside the attribute.
        return '<img src="%s" />' % escape(self.image_url(code="t"))
    admin_thumbnail.allow_tags = True
class Exif(models.Model):
    """A single EXIF entry (tagspace, tag and content) belonging to a photo.

    The entries of a photo are reachable through ``photo.exif``.
    """

    photo = models.ForeignKey(Photo, related_name="exif")
    tag = models.CharField(max_length=255)
    tagspace = models.CharField(max_length=255)
    content = models.CharField(max_length=255)

    def __unicode__(self):
        """Return "tagspace:tag" so entries are recognisable in the admin."""
        # Photo and Comment both define __unicode__; without one here the
        # admin labels every entry "Exif object".
        return u"%s:%s" % (self.tagspace, self.tag)
class Comment(models.Model):
    """A comment left on a photo, together with its author's id and name.

    The comments of a photo are reachable through ``photo.comments``.
    """

    photo = models.ForeignKey(Photo, related_name="comments")
    authorid = models.CharField(max_length=255)
    authorname = models.CharField(max_length=255)
    body = models.TextField(blank=True)

    def __unicode__(self):
        """Return a label naming the Flickr id of the commented photo."""
        commented_id = self.photo.flickrid
        return "Comment on %s" % commented_id
Admin
A custom admin makes the automatically generated admin views much more useful.
admin.py (inside flickr app)
from django.contrib import admin
from models import *
class PhotoAdmin(admin.ModelAdmin):
    """Admin options for Photo: list the thumbnail next to the text fields."""

    # "admin_thumbnail" refers to the Photo method of the same name
    list_display = (
        "flickrid",
        "admin_thumbnail",
        "title",
        "description",
        "page_url",
    )


admin.site.register(Photo, PhotoAdmin)
class CommentAdmin(admin.ModelAdmin):
    """Admin options for Comment: list each comment's photo and text."""

    list_display = (
        "photo",
        "body",
    )


admin.site.register(Comment, CommentAdmin)
class ExifAdmin(admin.ModelAdmin):
    """Admin options for Exif: a listing that can be filtered and searched."""

    # columns of the change list
    list_display = (
        "photo",
        "tagspace",
        "tag",
        "content",
    )
    # sidebar filters
    list_filter = (
        "tagspace",
        "tag",
    )
    # fields covered by the search box
    search_fields = (
        "tag",
        "content",
        "tagspace",
    )


admin.site.register(Exif, ExifAdmin)
Crawler
Finally, the actual crawler script. It runs in such a way that database records only get created the first time a photo is encountered. In this way, a simple crawl can be performed by periodically running the script (say using a cron job to run every hour). Flickr returns the latest 100 images matching the request.
To run this script "standalone" [1], it should be saved outside of the Django project (for instance, in the folder containing the "crawl" project), and the DJANGO_SETTINGS_MODULE variable should be set in the environment. So:
export DJANGO_SETTINGS_MODULE=crawl.settings
python crawler.py
crawler.py (in same folder as the project, not inside the project)
from crawl.flickr.models import *
import urllib2, json
from apikey import apikey
# Crawl Flickr for photos tagged "tomato" and store every photo that is not
# yet in the database, together with its comments and EXIF data.


def _call_flickr(method, extra=""):
    """Call a Flickr REST API method and return the decoded JSON reply.

    method -- name of the API method, e.g. "flickr.photos.getInfo"
    extra  -- further query arguments, already formatted as "&name=value"
    """
    url = ("http://api.flickr.com/services/rest/?method=" + method + extra +
           "&format=json&nojsoncallback=1&api_key=" + apikey)
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        # release the connection even when the reply cannot be parsed
        response.close()


def _import_photo(photo, photo_id):
    """Fetch the details, comments and EXIF data of a photo and store them.

    photo    -- the (still empty) Photo record to fill in
    photo_id -- the Flickr id of the photo, as a string
    """
    photo_arg = "&photo_id=" + photo_id

    # flickr.photos.getInfo
    info = _call_flickr("flickr.photos.getInfo", photo_arg)['photo']
    photo.title = info['title']['_content']
    photo.description = info['description']['_content']
    photo.page_url = info['urls']['url'][0]['_content']
    photo.farm = info['farm']
    photo.server = info['server']
    photo.secret = info['secret']

    # comments
    # One problem with this script is that comments are only ever checked
    # once (when the image is first seen), so new comments may be missing.
    numcomments = int(info['comments']['_content'])
    if numcomments:
        print(" reading comments (%d)..." % numcomments)
        comments = _call_flickr("flickr.photos.comments.getList", photo_arg)
        for c in comments['comments']['comment']:
            comment = Comment(photo=photo)
            comment.body = c['_content']
            comment.authorid = c['author']
            comment.authorname = c['authorname']
            comment.save()

    # EXIF
    exif = _call_flickr("flickr.photos.getExif", photo_arg)
    try:
        entries = exif['photo']['exif']
    except KeyError:
        # this happens when the image has no exif data
        print(" no exif data")
    else:
        # Kept outside the try so that a malformed entry is not mistaken
        # for "no exif data".
        for e in entries:
            data = Exif(photo=photo)
            data.tag = e['tag']
            data.tagspace = e['tagspace']
            data.content = e['raw']['_content']
            data.save()

    # finally remember to commit the changes to the database
    photo.save()


found = _call_flickr("flickr.photos.search", "&tags=tomato")
for p in found['photos']['photo']:
    # CREATE/LOOKUP A DATABASE PHOTO OBJECT
    (photo, created) = Photo.objects.get_or_create(flickrid=p['id'])
    # SKIP AHEAD IF WE HAVE THIS ONE ALREADY
    if not created:
        continue
    try:
        # Encode explicitly: printing unicode raises UnicodeEncodeError when
        # stdout is not a terminal (for example when run from cron).
        print('Processing photo: "%s"' % p['title'].encode('utf-8'))
        _import_photo(photo, p['id'])
    except (KeyError, ValueError, urllib2.URLError) as err:
        # An unexpected reply or a network problem: remove the half-filled
        # record (and anything stored for it) so that the next run retries
        # this photo instead of skipping it as "already seen".
        photo.delete()
        print(" import failed (%r), will retry on the next run" % (err,))
    except BaseException:
        # Anything else (including Ctrl-C) still aborts the crawl, but must
        # not leave a half-filled record behind either.
        photo.delete()
        raise