User:Sebastians/prototyping/googleSearchByImage

Some first simple attempts:

from GoogleImageSearch import GoogleImageSearch
from ImageDownloader import ImageDownloader


# Follow a chain of "similar" images: download a start image, ask the image
# search for images similar to it, download the first candidate that can be
# fetched, then repeat the search using that candidate as the new reference.
mySearch = GoogleImageSearch()
# Arguments: destination folder, socket timeout, flag to remember downloaded URLs.
myImageDownloader = ImageDownloader("downloads-fk", 30, 1)

myImageUrl = "<IMAGE URL>"

myImageDownloader.download(myImageUrl, "0000")

# Fallback reference used when a round yields nothing usable. It is set up
# front so the error handler below can never read an unassigned name (this
# happened when the very first search returned no results).
myOldImageUrl = myImageUrl

# i is the running number used for the file name; it only advances after a
# successful download.
# NOTE(review): because of that, the loop has no upper bound on the number of
# searches it performs if no candidate can ever be downloaded.
i = 1
while i < 500:
    mySimilarImages = mySearch.getSimilarImages(myImageUrl)
    # Doubles as the "stop trying candidates for this round" flag.
    myDownloadSuccess = 0
    myImageIndex = 0
    while myDownloadSuccess == 0:
        try:
            myImageUrlBefore = myImageUrl
            # Raises IndexError when the search returned no results; in that
            # case myOldImageUrl intentionally keeps its previous value.
            myImageUrl = mySimilarImages[myImageIndex]
            myOldImageUrl = myImageUrlBefore
            myDownloadSuccess = myImageDownloader.download(myImageUrl, '%04d' % i)
            if myDownloadSuccess == 1:
                i += 1
            else:
                myImageIndex += 1
                if myImageIndex == len(mySimilarImages):
                    # All candidates failed: end this round and fall back.
                    # NOTE(review): with more than one candidate the fallback
                    # is the previously tried candidate, not the reference
                    # this round started from -- confirm this is intended.
                    myDownloadSuccess = 1
                    myImageUrl = myOldImageUrl
        except Exception:
            # Best effort: give up on this round and search again from the
            # fallback URL. Unlike a bare "except", this lets
            # KeyboardInterrupt and SystemExit stop the script.
            myDownloadSuccess = 1
            myImageUrl = myOldImageUrl

import os, re, subprocess, time

class GoogleImageSearch:
    """Query Google's search-by-image page and extract result image URLs.

    Requests are issued through the external ``curl`` program and are
    throttled so that consecutive requests made through one instance are at
    least MIN_SECONDS_BETWEEN_REQUESTS seconds apart.
    """

    GOOGLE_URL = "http://www.google.com/searchbyimage?sbisrc=cr_1_0_0&image_url="

    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    MIN_SECONDS_BETWEEN_REQUESTS = 3

    # Matches the result links in the returned HTML. Group 1 is the image URL,
    # which ends at the next "&amp" or at an encoded "?" ("%3F"). The
    # terminator is a non-capturing group so findall() yields plain strings.
    _IMAGE_URL_PATTERN = re.compile(r'" href="/imgres\?imgurl=(.*?)(?:&amp|%3F)')

    # time.time() value taken right after the last request finished (0 = none yet).
    _myLastRequestTimestamp = 0

    def getSimilarImages(self, theReference):
        """Return the list of image URLs found for the reference image URL.

        Sleeps first if the previous request finished less than
        MIN_SECONDS_BETWEEN_REQUESTS seconds ago.
        """
        # Read the clock once so the sleep duration can never turn negative
        # between the comparison and the sleep call.
        myRemaining = self.MIN_SECONDS_BETWEEN_REQUESTS - (time.time() - self._myLastRequestTimestamp)
        if myRemaining > 0:
            time.sleep(myRemaining)
        myHtml = self.getHtml(self.GOOGLE_URL + theReference)
        self._myLastRequestTimestamp = time.time()
        return self.getImages(myHtml)

    def getHtml(self, theUrl):
        """Fetch theUrl with curl, following redirects, and return its output.

        Raises subprocess.CalledProcessError if curl exits with a non-zero
        status; the error's output then contains curl's error message.
        """
        # "-sS" turns off the progress meter (which would otherwise be mixed
        # into the captured page through the stderr redirect) but still
        # reports errors.
        myCommand = ["curl", "-sS", "-L", "-A", self.AGENT_ID, theUrl]
        return subprocess.check_output(myCommand, stderr=subprocess.STDOUT)

    def getImages(self, theHtml):
        """Return all image URLs referenced by result links in theHtml, in order."""
        return self._IMAGE_URL_PATTERN.findall(theHtml)

from urllib import urlretrieve
import urllib2, socket


class ImageDownloader:
    """Download images by URL into a folder, optionally skipping repeats.

    Only URLs whose file name ends in one of ALLOWED_EXTENSIONS are fetched.
    """

    ALLOWED_EXTENSIONS = [".gif", ".png", ".jpg", ".jpeg"]

    # Folder prefix (including the trailing "/") that files are written to.
    _myDestinationFolder = ""

    # Truthy when URLs of finished downloads should be remembered.
    _myKeepTrackOfDownloads = 0

    # Per-instance list of downloaded URLs; created in __init__ so that
    # instances do not share (and mutate) one class-level list.
    _myDownloads = None

    def __init__(self, theDestinationFolder, theTimeout, theKeepTrackOfDownloads):
        """Set up the downloader.

        theDestinationFolder    -- folder the images are written to
        theTimeout              -- timeout in seconds; note that it is
                                   installed as the process-wide default
                                   socket timeout
        theKeepTrackOfDownloads -- truthy to refuse URLs downloaded before
        """
        socket.setdefaulttimeout(theTimeout)
        self._myDestinationFolder = theDestinationFolder + "/"
        self._myKeepTrackOfDownloads = theKeepTrackOfDownloads
        self._myDownloads = []

    def download(self, theImageUrl, theNewName):
        """Download theImageUrl into the destination folder.

        The file is stored as theNewName plus the URL's extension, or under
        the URL's own file name when theNewName is "". Returns 1 on success
        and 0 if the URL was rejected or the download failed.
        """
        myImageName = self.getImageName(theImageUrl)
        if self.checkForValidDownload(theImageUrl, myImageName) == 0:
            return 0
        if theNewName == "":
            myNewImageName = myImageName["full"]
        else:
            myNewImageName = theNewName + myImageName["extension"]
        try:
            # Fetch once and always release the connection. The payload is
            # read completely before the file is opened, so a failed
            # transfer does not leave a truncated file behind.
            myResponse = urllib2.urlopen(theImageUrl)
            try:
                myData = myResponse.read()
            finally:
                myResponse.close()
            with open(self._myDestinationFolder + myNewImageName, "wb") as myFile:
                myFile.write(myData)
        except Exception:
            # Best effort: any network or file error counts as "not
            # downloaded"; KeyboardInterrupt/SystemExit still propagate.
            return 0
        print("Downloaded " + theImageUrl)
        if self._myKeepTrackOfDownloads:
            self._myDownloads.append(theImageUrl)
        return 1

    def getImageName(self, theImageUrl):
        """Split the file name of theImageUrl into its parts.

        Returns a dict with "name" (without extension), "extension"
        (including the leading dot, original case) and "full" (both joined).
        A query string or fragment is ignored, and only the text after the
        last dot counts as the extension. File names without a dot get the
        extension ".noextension".
        """
        myPath = theImageUrl.split("#", 1)[0].split("?", 1)[0]
        myFileBits = myPath.split("/")[-1].rsplit(".", 1)
        if len(myFileBits) < 2:
            myFileBits.append("noextension")
        myName = myFileBits[0]
        myExtension = "." + myFileBits[1]
        return {"name": myName, "extension": myExtension, "full": myName + myExtension}

    def checkForValidDownload(self, theImageUrl, theImageName):
        """Return 1 if theImageUrl should be downloaded, otherwise 0.

        A URL is rejected when download tracking is enabled and it has been
        downloaded before, or when the extension in theImageName (as
        returned by getImageName) is not in ALLOWED_EXTENSIONS
        (compared case-insensitively).
        """
        # has this image been downloaded already before?
        if self._myKeepTrackOfDownloads and theImageUrl in self._myDownloads:
            return 0
        # does the file have a "good" extension?
        if theImageName["extension"].lower() in self.ALLOWED_EXTENSIONS:
            return 1
        return 0