Building a web crawler with Django

From XPUB & Lens-Based wiki

An example of using Django to crawl Flickr.

(Compact instructions: create a Django project (called crawl) and an application (called flickr). Set up the database (sqlite3) and switch on the standard admin options.)

Models

Models are defined for a Photo, a Comment, and Exif data.

models.py (inside flickr app)

from django.db import models

class Photo (models.Model):
    """A Flickr photo, identified by its Flickr id.

    Besides the descriptive text, the model stores the farm/server/secret
    values that image_url() combines with the id to build image URLs.
    """
    flickrid = models.CharField(max_length=255, blank=True)
    title = models.CharField(max_length=255, blank=True)
    description = models.TextField(blank=True)
    page_url = models.URLField(blank=True, verify_exists=False)

    # the necessary info to construct the image URLs
    server = models.CharField(max_length=255, blank=True)
    farm = models.CharField(max_length=255, blank=True)
    secret = models.CharField(max_length=255, blank=True)

    def __unicode__ (self):
        # Format into a unicode literal rather than calling str(): under
        # Python 2, str() raises UnicodeEncodeError on non-ASCII text.
        return u"%s" % self.flickrid

    def image_url (self, code="t"):
        """Return the URL of the JPEG for this photo.

        code is the suffix placed after the secret in the file name
        (presumably Flickr's size letter; "t" is what admin_thumbnail uses).
        """
        return "http://farm%(farm)s.static.flickr.com/%(server)s/%(id)s_%(secret)s_%(code)s.jpg" % {
            'farm': self.farm,
            'server': self.server,
            'secret': self.secret,
            'id': self.flickrid,
            'code': code
        }

    def admin_thumbnail (self):
        """Return an <img> tag for the "t" image, for use in the admin list."""
        # Imported here so the fix does not depend on the file's import lines.
        from django.utils.html import escape
        # allow_tags makes the admin render this string as raw HTML, and the
        # URL parts are free-text fields, so escape the URL before embedding
        # it in the attribute.
        return '<img src="' + escape(self.image_url(code="t")) + '" />'
    admin_thumbnail.allow_tags = True

class Exif (models.Model):
    """One EXIF entry (tagspace, tag, content) attached to a Photo."""
    photo = models.ForeignKey(Photo, related_name="exif")
    tag = models.CharField(max_length=255)
    tagspace = models.CharField(max_length=255)
    content = models.CharField(max_length=255)

    def __unicode__ (self):
        # Photo and Comment both define a readable label; without one this
        # model is displayed as "Exif object".
        return u"%s:%s" % (self.tagspace, self.tag)
  
class Comment (models.Model):
    """A comment left on a Photo, with its author's id and display name."""
    photo = models.ForeignKey(Photo, related_name="comments")
    authorid = models.CharField(max_length=255)
    authorname = models.CharField(max_length=255)
    body = models.TextField(blank=True)

    def __unicode__ (self):
        # Label the comment by the Flickr id of the photo it belongs to.
        commented_id = self.photo.flickrid
        return "Comment on %s" % commented_id

Admin

A custom admin makes the automatically generated admin views much more useful.

admin.py (inside flickr app)

from django.contrib import admin
from models import *

class PhotoAdmin(admin.ModelAdmin):
    """Admin list for photos: id, thumbnail, then the descriptive fields."""
    list_display = (
        "flickrid",
        "admin_thumbnail",
        "title",
        "description",
        "page_url",
    )


admin.site.register(Photo, PhotoAdmin)

class CommentAdmin(admin.ModelAdmin):
    """Admin list for comments: the photo commented on and the comment text."""
    list_display = (
        "photo",
        "body",
    )


admin.site.register(Comment, CommentAdmin)

class ExifAdmin(admin.ModelAdmin):
    """Admin list for EXIF entries, filterable by tagspace/tag and searchable."""
    list_display = (
        "photo",
        "tagspace",
        "tag",
        "content",
    )
    list_filter = (
        "tagspace",
        "tag",
    )
    search_fields = (
        "tag",
        "content",
        "tagspace",
    )


admin.site.register(Exif, ExifAdmin)

Crawler

Finally, the actual crawler script. It runs in such a way that database records only get created the first time a photo is encountered. In this way, a simple crawl can be performed by periodically running the script (say using a cron job to run every hour). Flickr returns the latest 100 images matching the request.

To run this script "standalone" [1], it should be saved outside of the Django project (for instance, in the folder containing the "crawl" project), and the DJANGO_SETTINGS_MODULE variable should be set in the environment. So:

export DJANGO_SETTINGS_MODULE=crawl.settings
python crawler.py


crawler.py (in same folder as the project, not inside the project)

from crawl.flickr.models import *
import urllib2, json
from apikey import apikey

# The Flickr API is only served over HTTPS.
API_URL = "https://api.flickr.com/services/rest/"


class FlickrError(Exception):
    """Raised when Flickr answers a call with anything other than stat "ok"."""


def flickr_call(method, extra=""):
    """Call a Flickr REST method and return the decoded JSON reply.

    method is the Flickr method name (e.g. "flickr.photos.getInfo").
    extra is an already formatted run of additional query parameters,
    starting with "&" (e.g. "&photo_id=123").

    Raises FlickrError when the reply's "stat" is not "ok"; network and
    JSON decoding errors propagate from urllib2 / json unchanged.
    """
    url = (API_URL + "?method=" + method + extra
           + "&format=json&nojsoncallback=1&api_key=" + apikey)
    response = urllib2.urlopen(url)
    try:
        reply = json.load(response)
    finally:
        # Always release the connection, even if the body is not valid JSON.
        response.close()
    if reply.get('stat') != 'ok':
        raise FlickrError("%s: %s" % (method, reply.get('message', 'unknown error')))
    return reply


search = flickr_call("flickr.photos.search", "&tags=tomato")

for p in search['photos']['photo']:
    photo_id = p['id']
    photo_arg = "&photo_id=" + photo_id

    # SKIP AHEAD IF WE HAVE THIS ONE ALREADY
    # Only look the photo up here; the record is created further down, once
    # everything has been fetched. Creating it first would leave an empty
    # record behind whenever a request fails, and that record would then be
    # skipped on every later run.
    if Photo.objects.filter(flickrid=photo_id).count():
        continue

    # Titles are unicode and may contain non-ASCII characters. Encode them
    # explicitly: under cron, stdout has no encoding and Python 2 would
    # raise UnicodeEncodeError.
    print(('Processing photo: "%s"' % p['title']).encode('utf-8'))

    try:
        # flickr.photos.getInfo
        info = flickr_call("flickr.photos.getInfo", photo_arg)['photo']
        photo_fields = {
            'title': info['title']['_content'],
            'description': info['description']['_content'],
            'page_url': info['urls']['url'][0]['_content'],
            # the necessary info to construct the image URLs
            'farm': info['farm'],
            'server': info['server'],
            'secret': info['secret'],
        }

        # comments
        # One problem with this script is that comments are only ever checked
        # once (when the image is first seen), so new comments may be missing.
        comment_rows = []
        numcomments = int(info['comments']['_content'])
        if numcomments:
            print("    reading comments (%d)..." % numcomments)
            listing = flickr_call("flickr.photos.comments.getList", photo_arg)
            for c in listing['comments'].get('comment', []):
                comment_rows.append({
                    'body': c['_content'],
                    'authorid': c['author'],
                    'authorname': c['authorname'],
                })

        # EXIF
        # A failed getExif call is treated as "this photo has no EXIF data"
        # rather than as an error.
        exif_rows = []
        try:
            exif = flickr_call("flickr.photos.getExif", photo_arg)
        except FlickrError:
            exif = {}
        exif_tags = exif.get('photo', {}).get('exif')
        if exif_tags is None:
            print("    no exif data")
        else:
            for e in exif_tags:
                exif_rows.append({
                    'tag': e['tag'],
                    'tagspace': e['tagspace'],
                    'content': e['raw']['_content'],
                })
    except (urllib2.URLError, FlickrError, KeyError, IndexError, ValueError) as err:
        # Nothing has been stored for this photo yet, so the next run will
        # simply try it again; carry on with the remaining photos.
        print("    skipped, will retry on the next run (%r)" % (err,))
        continue

    # Everything was fetched: store the photo first (the related rows need
    # its primary key), then its comments and EXIF entries.
    # NOTE: a database error part-way through these saves would still leave
    # a photo without some of its related rows.
    photo = Photo(flickrid=photo_id, **photo_fields)
    photo.save()
    for row in comment_rows:
        Comment(photo=photo, **row).save()
    for row in exif_rows:
        Exif(photo=photo, **row).save()