User:Mirjam Dissel/image scraper

From XPUB & Lens-Based wiki

Displaying photo URLs from Flickr based on a simple English dictionary in the file a.txt
Next step: download files

flickrDownload.py

#!/usr/bin/env python
"""Usage: python flickrDownload.py TAGS
TAGS is a space delimited list of tags
 
Created by Matt Warren on 2009-09-08.
Copyright (c) 2009 HalOtis.com. All rights reserved.
"""
import sys
import shutil
import urllib
import random
import flickr
 
NUMBER_OF_IMAGES = 20
 
#this is slow
def get_urls_for_tags(tags, number):
    photos = flickr.photos_search(tags=tags, tag_mode='all', per_page=number)
    urls = []
    for photo in photos:
        try:
            urls.append(photo.getURL(size='Large', urlType='source'))
        except:
            continue
    return urls
 
def download_images(urls):
    for url in urls:
        file, mime = urllib.urlretrieve(url)
        name = url.split('/')[-1]
        print name
        shutil.copy(file, './'+name)
 
def main(*argv):
    args = argv[1:]
    if len(args) == 0:
        print "You must specify at least one tag"
        return 1
 
    tags = [item for item in args]
 
    urls = get_urls_for_tags(tags, NUMBER_OF_IMAGES)
    download_images(urls)

if __name__ == '__main__':
    #sys.exit(main(*sys.argv))

    words = open("a.txt").read().split()

    for word in words:
        urls = get_urls_for_tags(word, 20)
        url = random.choice(urls)        
        print word, url




Download pictures from Flickr's Interestingness page (interesting pictures from the last 7 days)

interestingness.py

#!/usr/bin/python
 
import simplejson
import urllib
import random
import os

API_KEY="api_key_here"
FLICKR_REST="http://api.flickr.com/services/rest/"
 
methods={
    "getList":"flickr.interestingness.getList",
    "getSizes":"flickr.photos.getSizes",
    }
 
def build_query(method,data=None):
    retval = FLICKR_REST + "?"
    base_params = urllib.urlencode({
            "format":"json",
            "method":methods[method],
            "api_key":API_KEY,
            })
    if data is not None:
        extra_params = urllib.urlencode(data)
    else:
        extra_params = ""
    retval += base_params + "&" + extra_params
    return retval
 
def do_query(method,data=None):
    conn = urllib.urlopen(build_query(method,data))
    data = conn.read()
    data =data[:-1].replace("jsonFlickrApi(","")
    try:
        retval = simplejson.loads(data)
    except:
        print data
        retval = {}
    return retval

photofeed = do_query("getList")

targetfolder = "flickrphotos/"

if not os.path.isdir(targetfolder):
	os.mkdir(targetfolder)

for i in range(100):
	photo = random.choice(photofeed['photos']['photo'])
	photosizes = do_query("getSizes",(("photo_id",photo['id']),))
 
	photoUrl = photosizes['sizes']['size'][-1]['source']
	filename = photoUrl.split('/')[-1]
	command = "wget {0}".format(photoUrl)
	os.chdir(targetfolder)
	if os.path.isfile(filename):
		print "NOT DOWNLOADING -- PHOTO ALREADY EXISTS"
	else:
		os.system(command)
 	os.chdir("..")

Download pictures from Flickr's most recent page

mostrecent.py

#!/usr/bin/python
 
import simplejson
import urllib
import random
import os

API_KEY="api_key_here"
FLICKR_REST="http://api.flickr.com/services/rest/"
 
methods={
    "getList":"flickr.photos.getRecent",
    "getSizes":"flickr.photos.getSizes",
    }
 
def build_query(method,data=None):
    retval = FLICKR_REST + "?"
    base_params = urllib.urlencode({
            "format":"json",
            "method":methods[method],
            "api_key":API_KEY,
            })
    if data is not None:
        extra_params = urllib.urlencode(data)
    else:
        extra_params = ""
    retval += base_params + "&" + extra_params
    return retval
 
def do_query(method,data=None):
    conn = urllib.urlopen(build_query(method,data))
    data = conn.read()
    data =data[:-1].replace("jsonFlickrApi(","")
    try:
        retval = simplejson.loads(data)
    except:
        print data
        retval = {}
    return retval

photofeed = do_query("getList")

targetfolder = "flickrphotosrecent/"

if not os.path.isdir(targetfolder):
	os.mkdir(targetfolder)

for i in range(100):
	photo = random.choice(photofeed['photos']['photo'])
	photosizes = do_query("getSizes",(("photo_id",photo['id']),))
 
	photoUrl = photosizes['sizes']['size'][-1]['source']
	filename = photoUrl.split('/')[-1]
	command = "wget {0}".format(photoUrl)
	os.chdir(targetfolder)
	if os.path.isfile(filename):
		print "NOT DOWNLOADING -- PHOTO ALREADY EXISTS"
	else:
		os.system(command)
 	os.chdir("..")