Building a web crawler with Django

From XPUB & Lens-Based wiki

An example of using Django to crawl Flickr.

(Compact instructions: create a Django project (called crawl) and an application (called flickr). Set up the database (sqlite3) and switch on the standard admin options.)

Models

Models are defined for a Photo, a Comment, and Exif data.

models.py (inside flickr app)

from django.db import models

class Photo(models.Model):
    """A Flickr photo, identified by its Flickr id.

    Besides the descriptive text, the record keeps the three values
    (server, farm, secret) that image_url() combines with the id to
    build links to the image files.
    """
    flickrid = models.CharField(max_length=255, blank=True)
    title = models.CharField(max_length=255, blank=True)
    description = models.TextField(blank=True)
    page_url = models.URLField(blank=True, verify_exists=False)

    # components of the static image URLs (see image_url)
    server = models.CharField(max_length=255, blank=True)
    farm = models.CharField(max_length=255, blank=True)
    secret = models.CharField(max_length=255, blank=True)

    def __unicode__(self):
        """Label the photo by its Flickr id."""
        flickr_id = self.flickrid
        return str(flickr_id)

    def image_url(self, code="t"):
        """Return the URL of this photo's JPEG file.

        code -- suffix placed before ".jpg" in the URL; the default "t"
                is the one admin_thumbnail() asks for.
        """
        template = "http://farm%(farm)s.static.flickr.com/%(server)s/%(id)s_%(secret)s_%(code)s.jpg"
        parts = dict(
            farm=self.farm,
            server=self.server,
            secret=self.secret,
            id=self.flickrid,
            code=code,
        )
        return template % parts

    def admin_thumbnail(self):
        """Return an <img> tag for the "t" image, for use as an admin column."""
        thumb_src = self.image_url(code="t")
        return "".join(('<img src="', thumb_src, '" />'))
    # have the admin render the returned markup instead of escaping it
    admin_thumbnail.allow_tags = True

class Exif (models.Model):
    """One Exif entry (tagspace, tag, content) attached to a Photo.

    The entries of a photo are reachable from the Photo side through
    the "exif" related name.
    """
    photo = models.ForeignKey(Photo, related_name="exif")
    tag = models.CharField(max_length=255)
    tagspace = models.CharField(max_length=255)
    content = models.CharField(max_length=255)

    def __unicode__ (self):
        # Give Exif rows a readable label like the other two models have,
        # instead of the generic "Exif object".
        return "%s:%s" % (self.tagspace, self.tag)
  
class Comment(models.Model):
    """A comment left on a Photo, with the id and name of its author.

    The comments of a photo are reachable from the Photo side through
    the "comments" related name.
    """
    photo = models.ForeignKey(Photo, related_name="comments")
    authorid = models.CharField(max_length=255)
    authorname = models.CharField(max_length=255)
    body = models.TextField(blank=True)

    def __unicode__(self):
        """Label the comment by the Flickr id of the photo it belongs to."""
        parent = self.photo
        return "Comment on %s" % (parent.flickrid,)

Admin

A custom admin makes the automatically generated admin views much more useful.

admin.py (inside flickr app)

from django.contrib import admin
from models import *

class PhotoAdmin(admin.ModelAdmin):
    """Admin options for Photo: list the id, a thumbnail and the text fields."""
    list_display = [
        "flickrid",
        "admin_thumbnail",
        "title",
        "description",
        "page_url",
    ]

admin.site.register(Photo, PhotoAdmin)

class CommentAdmin(admin.ModelAdmin):
    """Admin options for Comment: list each comment's photo and body."""
    list_display = [
        "photo",
        "body",
    ]

admin.site.register(Comment, CommentAdmin)

class ExifAdmin(admin.ModelAdmin):
    """Admin options for Exif: list, filter and search the stored tags."""
    # columns of the change list
    list_display = [
        "photo",
        "tagspace",
        "tag",
        "content",
    ]
    # sidebar filters
    list_filter = [
        "tagspace",
        "tag",
    ]
    # fields covered by the admin search box
    search_fields = [
        "tag",
        "content",
        "tagspace",
    ]

admin.site.register(Exif, ExifAdmin)

Crawler

Finally, the actual crawler script. It runs in such a way that database records only get created the first time a photo is encountered. In this way, a simple crawl can be performed by periodically running the script (say using a cron job to run every hour). Flickr returns the latest 100 images matching the request.

To run this script, save it outside of the Django project (for instance, in the folder containing the "crawl" project) and set the DJANGO_SETTINGS_MODULE variable in the environment. So:

export DJANGO_SETTINGS_MODULE=crawl.settings
python crawler.py

[1]

crawler.py (in same folder as the project, not inside the project)

from crawl.flickr.models import *
import urllib2, json
from apikey import apikey

# Flickr's REST endpoint. The API is served over HTTPS only, so the
# plain "http://" form of this URL is not usable.
API_ENDPOINT = "https://api.flickr.com/services/rest/"


def call_flickr(method, query):
    """Call one Flickr REST method and return the decoded JSON reply.

    method -- Flickr API method name, e.g. "flickr.photos.getInfo"
    query  -- the method-specific part of the query string, already
              formatted as "name=value"; it is inserted verbatim
              (no URL escaping is applied)
    """
    url = (API_ENDPOINT + "?method=" + method + "&" + query +
           "&format=json&nojsoncallback=1&api_key=" + apikey)
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        # close the connection explicitly instead of leaving it open
        response.close()


def crawl_photo(photo, flickr_id):
    """Fill in a newly created Photo and store its comments and Exif data.

    photo     -- the Photo record to complete (already saved once, so
                 Comment and Exif rows can point at it)
    flickr_id -- the photo's Flickr id, as a string

    Any error (network, unexpected reply, database) propagates to the
    caller.
    """
    # flickr.photos.getInfo
    info = call_flickr("flickr.photos.getInfo", "photo_id=" + flickr_id)['photo']

    photo.title = info['title']['_content']
    photo.description = info['description']['_content']
    photo.page_url = info['urls']['url'][0]['_content']

    photo.farm = info['farm']
    photo.server = info['server']
    photo.secret = info['secret']

    # comments
    # One problem with this script is that comments are only ever checked
    # once (when the image is first seen), so new comments may be missing.
    numcomments = int(info['comments']['_content'])
    if numcomments:
        print("    reading comments (%d)..." % numcomments)
        comments = call_flickr("flickr.photos.comments.getList", "photo_id=" + flickr_id)
        for c in comments['comments']['comment']:
            comment = Comment(photo=photo)
            comment.body = c['_content']
            comment.authorid = c['author']
            comment.authorname = c['authorname']
            comment.save()

    # EXIF
    exif = call_flickr("flickr.photos.getExif", "photo_id=" + flickr_id)
    try:
        # Only this lookup is guarded: a malformed entry further down
        # must not be reported as "no exif data".
        entries = exif['photo']['exif']
    except KeyError:
        # this happens when the image has no exif data
        print("    no exif data")
    else:
        for e in entries:
            data = Exif(photo=photo)
            data.tag = e['tag']
            data.tagspace = e['tagspace']
            data.content = e['raw']['_content']
            data.save()

    # finally remember to commit the changes to the database
    photo.save()


listing = call_flickr("flickr.photos.search", "tags=tomato")

for p in listing['photos']['photo']:
    # CREATE/LOOKUP A DATABASE PHOTO OBJECT
    (photo, created) = Photo.objects.get_or_create(flickrid=p['id'])
    # SKIP AHEAD IF WE HAVE THIS ONE ALREADY
    if not created:
        continue

    try:
        # NOTE(review): under Python 2 this print may fail for non-ASCII
        # titles when stdout is not a terminal (e.g. under cron) - confirm.
        print('Processing photo: "%s"' % p['title'])
        crawl_photo(photo, p['id'])
    except BaseException:
        # The row created above is still empty or half filled. If it
        # stayed in the database, every later run would skip this photo
        # as "already seen". Remove it (Django's default cascade also
        # removes the Comment/Exif rows pointing at it) so the next run
        # retries, then let the error surface as before.
        photo.delete()
        raise