Building a web crawler with Django
An example of using Django to crawl Flickr.
(Compact instructions: create a Django project called "crawl" and an application called "flickr", set up an sqlite3 database, and enable the standard admin.)
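In Django 1.x terms (the version this article assumes), those steps amount to roughly the following; the sqlite3 and admin settings are edited into settings.py by hand:

 django-admin.py startproject crawl
 cd crawl
 python manage.py startapp flickr
 # edit settings.py: use the sqlite3 backend, add django.contrib.admin
 # and crawl.flickr to INSTALLED_APPS, then create the tables:
 python manage.py syncdb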
== Models ==
Models are defined for a Photo, a Comment, and Exif data.
models.py (inside the flickr app)

<source lang="python">
from django.db import models

class Photo(models.Model):
    flickrid = models.CharField(max_length=255, blank=True)
    title = models.CharField(max_length=255, blank=True)
    description = models.TextField(blank=True)
    page_url = models.URLField(blank=True, verify_exists=False)
    # the necessary info to construct the image URLs
    server = models.CharField(max_length=255, blank=True)
    farm = models.CharField(max_length=255, blank=True)
    secret = models.CharField(max_length=255, blank=True)

    def __unicode__(self):
        return str(self.flickrid)

    def image_url(self, code="t"):
        return "http://farm%(farm)s.static.flickr.com/%(server)s/%(id)s_%(secret)s_%(code)s.jpg" % {
            'farm': self.farm,
            'server': self.server,
            'secret': self.secret,
            'id': self.flickrid,
            'code': code
        }

    def admin_thumbnail(self):
        return '<img src="' + self.image_url(code="t") + '" />'
    admin_thumbnail.allow_tags = True  # let the admin render the <img> tag as HTML

class Exif(models.Model):
    photo = models.ForeignKey(Photo, related_name="exif")
    tag = models.CharField(max_length=255)
    tagspace = models.CharField(max_length=255)
    content = models.CharField(max_length=255)

class Comment(models.Model):
    photo = models.ForeignKey(Photo, related_name="comments")
    authorid = models.CharField(max_length=255)
    authorname = models.CharField(max_length=255)
    body = models.TextField(blank=True)

    def __unicode__(self):
        return "Comment on %s" % self.photo.flickrid
</source>
== Admin ==
A custom admin makes the automatically generated admin views much more useful.
admin.py (inside the flickr app)

<source lang="python">
from django.contrib import admin
from models import *

class PhotoAdmin(admin.ModelAdmin):
    list_display = ("flickrid", "admin_thumbnail", "title", "description", "page_url")

admin.site.register(Photo, PhotoAdmin)

class CommentAdmin(admin.ModelAdmin):
    list_display = ("photo", "body")

admin.site.register(Comment, CommentAdmin)

class ExifAdmin(admin.ModelAdmin):
    list_display = ("photo", "tagspace", "tag", "content")
    list_filter = ("tagspace", "tag",)
    search_fields = ("tag", "content", "tagspace")

admin.site.register(Exif, ExifAdmin)
</source>
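The admin also needs the usual wiring in the project's urls.py; a minimal sketch for Django 1.x, assuming the default project layout:

<source lang="python">
# urls.py (project level)
from django.conf.urls.defaults import *
from django.contrib import admin

admin.autodiscover()  # finds flickr/admin.py

urlpatterns = patterns('',
    (r'^admin/', include(admin.site.urls)),
)
</source>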
== Crawler ==
Finally, the actual crawler script. Database records are only created the first time a photo is encountered, so a simple incremental crawl can be performed by running the script periodically (say, from an hourly cron job). Flickr returns the latest 100 images matching the request.
* http://www.flickr.com/services/api/
* http://www.flickr.com/services/api/flickr.photos.search.html
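For orientation, the search call returns JSON of roughly this shape, abridged here to the fields the script reads (all values invented):

<source lang="python">
# abridged flickr.photos.search response
{
    "photos": {
        "photo": [
            {"id": "5393772797", "secret": "abc123", "server": "5213",
             "farm": 6, "title": "tomato harvest"},
        ]
    },
    "stat": "ok"
}
</source>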
To run this script "standalone" [http://www.b-list.org/weblog/2007/sep/22/standalone-django-scripts/], it should be saved outside of the Django project (for instance in the folder containing the "crawl" project), and the '''DJANGO_SETTINGS_MODULE''' variable should be set in the environment. So:
 export DJANGO_SETTINGS_MODULE=crawl.settings
 python crawler.py
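The script reads its API key from a small sibling module, so a minimal apikey.py next to crawler.py looks like this (the key string is a placeholder for your own):

<source lang="python">
# apikey.py -- keeps your real Flickr API key out of the main script
apikey = "0123456789abcdef0123456789abcdef"
</source>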
crawler.py (in the same folder as the project, '''not''' inside the project)

<source lang="python">
from crawl.flickr.models import *
import urllib2, json
from apikey import apikey

# flickr.photos.search: the latest 100 photos tagged "tomato"
url = "http://api.flickr.com/services/rest/?method=flickr.photos.search&tags=tomato&format=json&nojsoncallback=1&api_key=" + apikey
j = json.load(urllib2.urlopen(url))
for p in j['photos']['photo']:
    # CREATE/LOOKUP A DATABASE PHOTO OBJECT
    (photo, created) = Photo.objects.get_or_create(flickrid=p['id'])
    # SKIP AHEAD IF WE HAVE THIS ONE ALREADY
    if not created: continue
    print 'Processing photo: "%s"' % p['title']
    # flickr.photos.getInfo
    url = "http://api.flickr.com/services/rest/?method=flickr.photos.getInfo&photo_id=" + p['id'] + "&format=json&nojsoncallback=1&api_key=" + apikey
    info = json.load(urllib2.urlopen(url))
    photo.title = info['photo']['title']['_content']
    photo.description = info['photo']['description']['_content']
    photo.page_url = info['photo']['urls']['url'][0]['_content']
    photo.farm = info['photo']['farm']
    photo.server = info['photo']['server']
    photo.secret = info['photo']['secret']
    # comments
    # One problem with this script is that comments are only ever checked once
    # (when the image is first seen), so comments added later will be missed.
    numcomments = int(info['photo']['comments']['_content'])
    if numcomments:
        print "  reading comments (%d)..." % numcomments
        url = "http://api.flickr.com/services/rest/?method=flickr.photos.comments.getList&photo_id=" + p['id'] + "&format=json&nojsoncallback=1&api_key=" + apikey
        comments = json.load(urllib2.urlopen(url))
        for c in comments['comments']['comment']:
            comment = Comment(photo=photo)
            comment.body = c['_content']
            comment.authorid = c['author']
            comment.authorname = c['authorname']
            comment.save()
    # EXIF
    url = "http://api.flickr.com/services/rest/?method=flickr.photos.getExif&photo_id=" + p['id'] + "&format=json&nojsoncallback=1&api_key=" + apikey
    exif = json.load(urllib2.urlopen(url))
    try:
        for e in exif['photo']['exif']:
            data = Exif(photo=photo)
            data.tag = e['tag']
            data.tagspace = e['tagspace']
            data.content = e['raw']['_content']
            data.save()
    except KeyError:
        # this happens when the image has no exif data
        print "  no exif data"
    # finally remember to commit the changes to the database
    photo.save()
</source>
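To get the hourly run mentioned above, a crontab entry along these lines will do (the project path is a placeholder):

 # run the crawl at the top of every hour
 0 * * * * cd /path/to/projects && DJANGO_SETTINGS_MODULE=crawl.settings python crawler.py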