User:Sebastians/prototyping/googleSearchByImage
< User:Sebastians
Revision as of 09:25, 27 October 2011 by Sebastians (talk | contribs) (Created page with " <source lang="python"> import os, re, subprocess, time class GoogleImageSearch : GOOGLE_URL = "http://www.google.com/searchbyimage?sbisrc=cr_1_0_0&image_url=" AGE...")
import os, re, subprocess, time
class GoogleImageSearch :
    """Scrape image URLs from Google's search-by-image result page.

    Pages are fetched by shelling out to ``curl``. Requests issued through
    one instance are throttled: a new request is not started until at least
    MIN_SECONDS_BETWEEN_REQUESTS seconds have passed since the previous
    request of that instance completed.
    """

    GOOGLE_URL = "http://www.google.com/searchbyimage?sbisrc=cr_1_0_0&image_url="
    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    MIN_SECONDS_BETWEEN_REQUESTS = 3

    # Matches the imgurl value of every '/imgres' result link, up to the first
    # "&" or URL-encoded "?" ("%3F"). The delimiter group is non-capturing so
    # findall() yields plain strings rather than (url, delimiter) tuples.
    _IMAGE_URL_PATTERN = re.compile(r'" href="/imgres\?imgurl=(.*?)(?:&|%3F)')

    # time.time() value recorded after the last completed request (0 = never).
    _myLastRequestTimestamp = 0

    def getSimilarImages(self, theReference) :
        """Return the list of image URLs Google reports for theReference.

        theReference is appended verbatim to GOOGLE_URL.
        NOTE(review): no URL-escaping is applied here; a reference containing
        characters such as "&" presumably has to be escaped by the caller.

        Blocks (sleeps) as long as needed to honour the request throttle.
        """
        while True :
            # Compute the remaining wait once per pass so the value handed to
            # sleep() can never have turned negative since it was checked.
            myWait = self.MIN_SECONDS_BETWEEN_REQUESTS - (time.time() - self._myLastRequestTimestamp)
            if myWait <= 0 :
                break
            time.sleep(myWait)
        myHtml = self.getHtml(self.GOOGLE_URL + theReference)
        self._myLastRequestTimestamp = time.time()
        return self.getImages(myHtml)

    def getHtml(self, theUrl) :
        """Fetch theUrl with curl (following redirects) and return its output.

        Raises subprocess.CalledProcessError when curl exits with a non-zero
        status; the exception's output then carries curl's error message.
        """
        # -s suppresses the progress meter, which would otherwise be mixed
        # into the page because stderr is merged into stdout; -S keeps real
        # error messages visible.
        return subprocess.check_output(
            ["curl", "-s", "-S", "-L", "-A", self.AGENT_ID, theUrl],
            stderr=subprocess.STDOUT)

    def getImages(self, theHtml) :
        """Return all image URLs found in theHtml, in document order."""
        # On Python 3 check_output() returns bytes; on Python 2 bytes is str,
        # so this branch is never taken there and the input is used as-is.
        if isinstance(theHtml, bytes) and not isinstance(theHtml, str) :
            theHtml = theHtml.decode("utf-8", "replace")
        return self._IMAGE_URL_PATTERN.findall(theHtml)
from urllib import urlretrieve
import urllib2, socket
class ImageDownloader :
    """Download image files into a single destination folder.

    Only URLs whose file name ends in one of ALLOWED_EXTENSIONS are fetched.
    When download tracking is enabled, a URL that this instance has already
    downloaded successfully is skipped on later calls.
    """

    ALLOWED_EXTENSIONS = [".gif", ".png", ".jpg", ".jpeg"]
    _myDestinationFolder = ""
    _myKeepTrackOfDownloads = 0

    def __init__(self, theDestinationFolder, theTimeout, theKeepTrackOfDownloads) :
        """Configure the downloader.

        theDestinationFolder -- folder the files are written to (a "/" is
                                appended; the folder is not created here).
        theTimeout           -- timeout in seconds, installed as the
                                process-wide default socket timeout.
        theKeepTrackOfDownloads -- truthy to remember downloaded URLs and
                                refuse to fetch them again.
        """
        # NOTE: setdefaulttimeout() affects every socket created afterwards in
        # this process, not just the ones used by this instance.
        socket.setdefaulttimeout(theTimeout)
        self._myDestinationFolder = theDestinationFolder + "/"
        self._myKeepTrackOfDownloads = theKeepTrackOfDownloads
        # Per-instance history; a class-level list would be shared (and
        # mutated) by every ImageDownloader in the process.
        self._myDownloads = []

    def download(self, theImageUrl, theNewName) :
        """Fetch theImageUrl into the destination folder.

        The file keeps its original name when theNewName is empty; otherwise
        it is stored as theNewName plus the original extension.

        Returns 1 on success and 0 when the URL is rejected by
        checkForValidDownload() or when fetching/writing fails.
        """
        myImageName = self.getImageName(theImageUrl)
        if not self.checkForValidDownload(theImageUrl, myImageName) :
            return 0
        if theNewName :
            myNewImageName = theNewName + myImageName["extension"]
        else :
            myNewImageName = myImageName["full"]
        try :
            # Single fetch: the response is read completely and closed before
            # anything is written, so a failed transfer leaves no partial file.
            myResponse = urllib2.urlopen(theImageUrl)
            try :
                myData = myResponse.read()
            finally :
                myResponse.close()
            with open(self._myDestinationFolder + myNewImageName, "wb") as myFile :
                myFile.write(myData)
            print("Downloaded " + theImageUrl)
            if self._myKeepTrackOfDownloads :
                self._myDownloads.append(theImageUrl)
            return 1
        except Exception :
            # Best effort: network errors, timeouts and write failures all
            # count as "not downloaded". KeyboardInterrupt/SystemExit are
            # deliberately not swallowed.
            return 0

    def getImageName(self, theImageUrl) :
        """Split the last path segment of theImageUrl into name and extension.

        Returns a dict with the keys "name", "extension" (including the
        leading dot) and "full" (name + extension). The extension is whatever
        follows the LAST dot; a segment without any dot gets the placeholder
        extension ".noextension".
        """
        myFileName = theImageUrl.split("/")[-1]
        myName, mySeparator, myExtension = myFileName.rpartition(".")
        if not mySeparator :
            myName, myExtension = myFileName, "noextension"
        return {
            "name" : myName,
            "extension" : "." + myExtension,
            "full" : myName + "." + myExtension,
        }

    def checkForValidDownload(self, theImageUrl, theImageName) :
        """Return 1 if theImageUrl should be downloaded, 0 otherwise.

        theImageName is the dict produced by getImageName(). A URL is refused
        when tracking is on and it was downloaded before, or when its
        extension (compared case-insensitively) is not in ALLOWED_EXTENSIONS.
        """
        # has this image been downloaded already before?
        if self._myKeepTrackOfDownloads and theImageUrl in self._myDownloads :
            return 0
        # does the file have a "good" extension?
        if theImageName["extension"].lower() in self.ALLOWED_EXTENSIONS :
            return 1
        return 0