User:Sebastians/prototyping/googleSearchByImage
< User:Sebastians
Revision as of 10:38, 27 October 2011 by Sebastians (talk | contribs)
A few simple first attempts:
from GoogleImageSearch import GoogleImageSearch
from ImageDownloader import ImageDownloader
# Crawl a chain of "similar" images: download a seed image, then repeatedly
# ask the search for images similar to the current reference URL and save the
# first candidate that downloads successfully under a 4-digit running number.
mySearch = GoogleImageSearch()
myImageDownloader = ImageDownloader("downloads-fk", 30, 1)
myImageUrl = "<IMAGE URL>"  # placeholder: put the seed image URL here
myImageDownloader.download(myImageUrl, "0000")

# Bound up front so the recovery paths below can always restore a reference
# URL, even when the very first search returns no candidates at all.
myOldImageUrl = myImageUrl

# NOTE(review): i only advances on a successful download, so there is no cap
# on the number of unsuccessful search rounds.
i = 1
while i < 500:
    mySimilarImages = mySearch.getSimilarImages(myImageUrl)
    myDownloadSuccess = 0
    myImageIndex = 0
    # Walk the candidates in order until one downloads or the list runs out.
    while myDownloadSuccess == 0:
        try:
            myImageUrlBefore = myImageUrl
            myImageUrl = mySimilarImages[myImageIndex]
            myOldImageUrl = myImageUrlBefore
            myDownloadSuccess = myImageDownloader.download(myImageUrl, '%04d' % i)
            if myDownloadSuccess == 1:
                i += 1
            else:
                myImageIndex += 1
                if myImageIndex == len(mySimilarImages):
                    # Candidates exhausted: leave the inner loop (the flag is
                    # reused as the exit condition) and step the reference
                    # URL back by one.
                    myDownloadSuccess = 1
                    myImageUrl = myOldImageUrl
        except Exception:
            # Typically an IndexError from an empty result list. Catching
            # Exception rather than everything keeps Ctrl-C working.
            myDownloadSuccess = 1
            myImageUrl = myOldImageUrl
import os, re, subprocess, time
class GoogleImageSearch:
    """Fetch Google's search-by-image result page and extract image URLs.

    Requests are throttled so that at least MIN_SECONDS_BETWEEN_REQUESTS
    seconds pass between the end of one request and the start of the next
    (tracked per instance).
    """

    GOOGLE_URL = "http://www.google.com/searchbyimage?sbisrc=cr_1_0_0&image_url="
    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    MIN_SECONDS_BETWEEN_REQUESTS = 3

    # Matches result links of the form `" href="/imgres?imgurl=<URL>` and
    # captures <URL> up to the first "&" or "%3F" (a URL-encoded "?").
    # The terminator group is non-capturing so findall() yields plain strings.
    _IMAGE_URL_PATTERN = re.compile(r'" href="/imgres\?imgurl=(.*?)(?:&|%3F)')

    # time.time() value taken right after the last request; 0 = none yet.
    _myLastRequestTimestamp = 0

    def getSimilarImages(self, theReference):
        """Return the image URLs found on the result page for theReference.

        theReference is appended verbatim to GOOGLE_URL (no escaping is
        applied here). Blocks until the minimum interval since the previous
        request has elapsed.
        """
        while True:
            # Read the clock once per pass so the sleep length can never go
            # negative between the check and the sleep call.
            myRemaining = self.MIN_SECONDS_BETWEEN_REQUESTS - (
                time.time() - self._myLastRequestTimestamp)
            if myRemaining <= 0:
                break
            time.sleep(myRemaining)
        myHtml = self.getHtml(self.GOOGLE_URL + theReference)
        self._myLastRequestTimestamp = time.time()
        return self.getImages(myHtml)

    def getHtml(self, theUrl):
        """Return the output of `curl -L -A AGENT_ID theUrl`.

        Raises subprocess.CalledProcessError when curl exits non-zero.
        NOTE: stderr is merged into the returned output, so anything curl
        writes to stderr ends up in the same string as the page body.
        """
        myHtml = subprocess.check_output(
            ["curl", "-L", "-A", self.AGENT_ID, theUrl],
            stderr=subprocess.STDOUT)
        return myHtml

    def getImages(self, theHtml):
        """Return all image URLs referenced by imgres links in theHtml, in
        document order (an empty list when there are none)."""
        return self._IMAGE_URL_PATTERN.findall(theHtml)
from urllib import urlretrieve
import urllib2, socket
class ImageDownloader:
    """Download image URLs into a destination folder.

    Only URLs whose file extension is in ALLOWED_EXTENSIONS are fetched.
    Optionally remembers downloaded URLs so each is fetched only once.
    """

    ALLOWED_EXTENSIONS = [".gif", ".png", ".jpg", ".jpeg"]
    _myDestinationFolder = ""
    _myKeepTrackOfDownloads = 0
    # The list of downloaded URLs is created per instance in __init__; a
    # class-level list would be shared by every downloader.

    def __init__(self, theDestinationFolder, theTimeout, theKeepTrackOfDownloads):
        """Set up the downloader.

        theDestinationFolder    -- folder the files are written to (a "/" is
                                   appended; the folder is not created here)
        theTimeout              -- passed to socket.setdefaulttimeout(), so it
                                   applies process-wide
        theKeepTrackOfDownloads -- truthy to refuse URLs already downloaded
        """
        socket.setdefaulttimeout(theTimeout)
        self._myDestinationFolder = theDestinationFolder + "/"
        self._myKeepTrackOfDownloads = theKeepTrackOfDownloads
        self._myDownloads = []

    def download(self, theImageUrl, theNewName):
        """Download theImageUrl; return 1 on success, 0 otherwise.

        The file is stored as theNewName plus the URL's extension, or under
        its original file name when theNewName is "". Failures of any kind
        (invalid URL, timeout, HTTP or I/O error) are reported as 0.
        """
        myImageName = self.getImageName(theImageUrl)
        if theNewName == "":
            myNewImageName = myImageName["full"]
        else:
            myNewImageName = theNewName + myImageName["extension"]
        if self.checkForValidDownload(theImageUrl, myImageName) == 0:
            return 0
        try:
            # Probe the URL first: urllib2.urlopen raises on HTTP error
            # statuses (presumably why the probe exists -- urlretrieve alone
            # may just save the error page). Close the probe right away so
            # the connection is not leaked.
            urllib2.urlopen(theImageUrl).close()
            urlretrieve(theImageUrl, self._myDestinationFolder + myNewImageName)
        except Exception:
            # Best effort: covers socket.timeout and any other failure, but
            # no longer swallows KeyboardInterrupt/SystemExit.
            return 0
        else:
            print("Downloaded " + theImageUrl)
            if self._myKeepTrackOfDownloads:
                self._myDownloads.append(theImageUrl)
            return 1

    def getImageName(self, theImageUrl):
        """Split the last path segment of theImageUrl into name and extension.

        Returns a dict with "name", "extension" (with leading dot) and "full".
        The extension is whatever follows the LAST dot, so "a.b.jpg" yields
        ".jpg"; a segment without any dot gets ".noextension".
        """
        myFileName = theImageUrl.split("/")[-1]
        myName, myDot, myExtension = myFileName.rpartition(".")
        if myDot == "":
            # No dot found: rpartition put the whole string in the last slot.
            myName, myExtension = myFileName, "noextension"
        return {
            "name": myName,
            "extension": "." + myExtension,
            "full": myName + "." + myExtension,
        }

    def checkForValidDownload(self, theImageUrl, theImageName):
        """Return 1 if theImageUrl should be downloaded, else 0.

        A URL is rejected when tracking is on and it was downloaded before,
        or when its extension (case-insensitive) is not in ALLOWED_EXTENSIONS.
        """
        # has this image been downloaded already before?
        if self._myKeepTrackOfDownloads and theImageUrl in self._myDownloads:
            return 0
        # does the file have a "good" extension?
        if theImageName["extension"].lower() in self.ALLOWED_EXTENSIONS:
            return 1
        return 0