User:Mirjam Dissel/image scraper
Displaying photo URLs from Flickr based on a simple English dictionary in the file a.txt
Next step: download files
flickrDownload.py
#!/usr/bin/env python
"""Usage: python flickrDownload.py TAGS
TAGS is a space delimited list of tags
Created by Matt Warren on 2009-09-08.
Copyright (c) 2009 HalOtis.com. All rights reserved.
"""
import sys
import shutil
import urllib
import random
import flickr
NUMBER_OF_IMAGES = 20
#this is slow
def get_urls_for_tags(tags, number):
    """Return the 'Large' source URLs of photos found for *tags*.

    Args:
        tags: Tag or tags handed unchanged to ``flickr.photos_search``,
            which is called with ``tag_mode='all'``.
        number: Passed to the search as ``per_page``.

    Returns:
        A list with one URL per photo whose ``getURL`` call succeeded;
        photos for which it raised are left out.
    """
    photos = flickr.photos_search(tags=tags, tag_mode='all', per_page=number)
    urls = []
    for photo in photos:
        try:
            url = photo.getURL(size='Large', urlType='source')
        except Exception:
            # Best effort: skip photos that cannot provide this URL
            # (presumably because no 'Large' rendition exists -- confirm
            # against the flickr module).  Catching Exception rather than
            # using a bare except lets Ctrl-C and SystemExit propagate.
            continue
        urls.append(url)
    return urls
def download_images(urls):
    """Download every URL in *urls* into the current working directory.

    Each file is stored under the last path component of its URL, and that
    name is printed once the download has finished.

    Args:
        urls: Iterable of URL strings.
    """
    for url in urls:
        name = url.split('/')[-1]
        if not name:
            # A URL ending in '/' has no file name to save under.
            continue
        # Fetch straight into the destination.  Retrieving without a target
        # creates a temporary file that would be left behind after copying.
        urllib.urlretrieve(url, './' + name)
        print(name)
def main(*argv):
    """Download images for the tags given on the command line.

    Args:
        *argv: Command-line style arguments; the first entry (the program
            name) is ignored and the rest are used as tags.

    Returns:
        1 when no tag was supplied, otherwise None.
    """
    tags = list(argv[1:])
    if not tags:
        print("You must specify at least one tag")
        return 1
    found = get_urls_for_tags(tags, NUMBER_OF_IMAGES)
    download_images(found)
if __name__ == '__main__':
    # The tag-based download entry point, sys.exit(main(*sys.argv)), is
    # currently disabled.  Instead, every whitespace-separated word in a.txt
    # is looked up on Flickr and one randomly chosen photo URL is printed
    # next to it.
    with open("a.txt") as word_file:
        words = word_file.read().split()
    for word in words:
        urls = get_urls_for_tags(word, NUMBER_OF_IMAGES)
        if not urls:
            # random.choice raises IndexError on an empty list; report the
            # miss on stderr and carry on with the next word.
            sys.stderr.write("%s: no photo found\n" % word)
            continue
        url = random.choice(urls)
        print("%s %s" % (word, url))
Download pictures from Flickr's Interestingness page (interesting pictures from the last 7 days)
interestingness.py
#!/usr/bin/python
import simplejson
import urllib
import random
import os
# Flickr API key sent with every request; replace the placeholder with a
# real key before running.
API_KEY = "api_key_here"
# REST endpoint of the Flickr API.  The API is served over HTTPS only, so
# the plain http:// form of this URL no longer works.
FLICKR_REST = "https://api.flickr.com/services/rest/"
# Short names used by this script mapped to Flickr API method names.
methods = {
    "getList": "flickr.interestingness.getList",
    "getSizes": "flickr.photos.getSizes",
}
def build_query(method, data=None):
    """Build the request URL for one Flickr REST call.

    Args:
        method: Key into the module-level ``methods`` table.
        data: Optional extra query parameters, in any form accepted by
            ``urllib.urlencode`` (a mapping or a sequence of pairs).

    Returns:
        ``FLICKR_REST`` followed by ``?`` and the URL-encoded parameters:
        ``format=json``, the API method name and the API key, then the
        encoded *data* when there is any.

    Raises:
        KeyError: If *method* is not a key of ``methods``.
    """
    query = urllib.urlencode({
        "format": "json",
        "method": methods[method],
        "api_key": API_KEY,
    })
    if data:
        # Add the separator only when extra parameters follow, so the URL
        # never ends in a dangling "&".
        query += "&" + urllib.urlencode(data)
    return FLICKR_REST + "?" + query
def do_query(method, data=None):
    """Call the Flickr REST API and return the decoded JSON response.

    Args:
        method: Key into the module-level ``methods`` table, forwarded to
            ``build_query``.
        data: Optional extra query parameters, forwarded to ``build_query``.

    Returns:
        The object decoded by ``simplejson.loads``.  When the response
        cannot be decoded, the response text is printed and an empty dict
        is returned.
    """
    conn = urllib.urlopen(build_query(method, data))
    try:
        body = conn.read()
    finally:
        # Release the connection even if reading fails.
        conn.close()

    # The response is expected to be wrapped as jsonFlickrApi(...).  Remove
    # only that leading wrapper and its closing parenthesis, so the same
    # text occurring inside the payload is left untouched.
    body = body.strip()
    wrapper = "jsonFlickrApi("
    if body.startswith(wrapper) and body.endswith(")"):
        body = body[len(wrapper):-1]

    try:
        return simplejson.loads(body)
    except ValueError:
        # Not valid JSON: show what came back and let the caller continue
        # with an empty result.
        print(body)
        return {}
# Fetch the photo list, then download up to 100 randomly picked photos from
# it into targetfolder.
photofeed = do_query("getList")
targetfolder = "flickrphotos/"
if not os.path.isdir(targetfolder):
    os.mkdir(targetfolder)

# do_query returns {} when the response could not be decoded, so look the
# photo list up defensively instead of indexing into it.
candidates = photofeed.get('photos', {}).get('photo', [])
if not candidates:
    print("NOT DOWNLOADING -- NO PHOTOS RETURNED")
else:
    for i in range(100):
        # Picks are made with replacement; a repeated pick is caught by the
        # "already exists" check below.
        photo = random.choice(candidates)
        photosizes = do_query("getSizes", (("photo_id", photo['id']),))
        sizes = photosizes.get('sizes', {}).get('size', [])
        if not sizes:
            # The size lookup failed or returned nothing for this photo.
            continue
        # The last listed size is used (presumably the largest -- confirm
        # against the Flickr getSizes documentation).
        photoUrl = sizes[-1]['source']
        filename = photoUrl.split('/')[-1]
        destination = os.path.join(targetfolder, filename)
        if os.path.isfile(destination):
            print("NOT DOWNLOADING -- PHOTO ALREADY EXISTS")
        else:
            # Download in-process rather than through a shell command built
            # from the remote URL; this also removes the need to change the
            # working directory for every photo.
            try:
                urllib.urlretrieve(photoUrl, destination)
            except IOError:
                print("DOWNLOAD FAILED -- " + photoUrl)
Download pictures from Flickr's most recent page
mostrecent.py
#!/usr/bin/python
import simplejson
import urllib
import random
import os
# Flickr API key sent with every request; replace the placeholder with a
# real key before running.
API_KEY = "api_key_here"
# REST endpoint of the Flickr API.  The API is served over HTTPS only, so
# the plain http:// form of this URL no longer works.
FLICKR_REST = "https://api.flickr.com/services/rest/"
# Short names used by this script mapped to Flickr API method names.
methods = {
    "getList": "flickr.photos.getRecent",
    "getSizes": "flickr.photos.getSizes",
}
def build_query(method, data=None):
    """Build the request URL for one Flickr REST call.

    Args:
        method: Key into the module-level ``methods`` table.
        data: Optional extra query parameters, in any form accepted by
            ``urllib.urlencode`` (a mapping or a sequence of pairs).

    Returns:
        ``FLICKR_REST`` followed by ``?`` and the URL-encoded parameters:
        ``format=json``, the API method name and the API key, then the
        encoded *data* when there is any.

    Raises:
        KeyError: If *method* is not a key of ``methods``.
    """
    query = urllib.urlencode({
        "format": "json",
        "method": methods[method],
        "api_key": API_KEY,
    })
    if data:
        # Add the separator only when extra parameters follow, so the URL
        # never ends in a dangling "&".
        query += "&" + urllib.urlencode(data)
    return FLICKR_REST + "?" + query
def do_query(method, data=None):
    """Call the Flickr REST API and return the decoded JSON response.

    Args:
        method: Key into the module-level ``methods`` table, forwarded to
            ``build_query``.
        data: Optional extra query parameters, forwarded to ``build_query``.

    Returns:
        The object decoded by ``simplejson.loads``.  When the response
        cannot be decoded, the response text is printed and an empty dict
        is returned.
    """
    conn = urllib.urlopen(build_query(method, data))
    try:
        body = conn.read()
    finally:
        # Release the connection even if reading fails.
        conn.close()

    # The response is expected to be wrapped as jsonFlickrApi(...).  Remove
    # only that leading wrapper and its closing parenthesis, so the same
    # text occurring inside the payload is left untouched.
    body = body.strip()
    wrapper = "jsonFlickrApi("
    if body.startswith(wrapper) and body.endswith(")"):
        body = body[len(wrapper):-1]

    try:
        return simplejson.loads(body)
    except ValueError:
        # Not valid JSON: show what came back and let the caller continue
        # with an empty result.
        print(body)
        return {}
# Fetch the photo list, then download up to 100 randomly picked photos from
# it into targetfolder.
photofeed = do_query("getList")
targetfolder = "flickrphotosrecent/"
if not os.path.isdir(targetfolder):
    os.mkdir(targetfolder)

# do_query returns {} when the response could not be decoded, so look the
# photo list up defensively instead of indexing into it.
candidates = photofeed.get('photos', {}).get('photo', [])
if not candidates:
    print("NOT DOWNLOADING -- NO PHOTOS RETURNED")
else:
    for i in range(100):
        # Picks are made with replacement; a repeated pick is caught by the
        # "already exists" check below.
        photo = random.choice(candidates)
        photosizes = do_query("getSizes", (("photo_id", photo['id']),))
        sizes = photosizes.get('sizes', {}).get('size', [])
        if not sizes:
            # The size lookup failed or returned nothing for this photo.
            continue
        # The last listed size is used (presumably the largest -- confirm
        # against the Flickr getSizes documentation).
        photoUrl = sizes[-1]['source']
        filename = photoUrl.split('/')[-1]
        destination = os.path.join(targetfolder, filename)
        if os.path.isfile(destination):
            print("NOT DOWNLOADING -- PHOTO ALREADY EXISTS")
        else:
            # Download in-process rather than through a shell command built
            # from the remote URL; this also removes the need to change the
            # working directory for every photo.
            try:
                urllib.urlretrieve(photoUrl, destination)
            except IOError:
                print("DOWNLOAD FAILED -- " + photoUrl)