User:Silviolorusso/thematic2/googlescrapers: Difference between revisions
No edit summary |
|||
Line 108: | Line 108: | ||
== Google Image Search Scraper == | == Google Image Search Scraper == | ||
#!/home/slorusso/ENV/bin/python | |||
# coding: utf-8 | |||
import subprocess, re, urllib | |||
from BeautifulSoup import BeautifulSoup | |||
class GoogleImageSearchResults:
    """Fetch a Google Images result page with curl and pull data out of it.

    Call scrapeImages() first; the other methods parse the page it stored
    in ``_myGooglePage``.
    """

    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    GOOGLE_URL = "https://www.google.com/images?q={0}&sout=1"
    _myGooglePage = ""

    # Compiled once instead of once per table cell.
    _IMGURL_RE = re.compile(r'imgurl=(?P<imgurl>[^&]+)&')

    def scrapeImages(self, theQuery):
        """Download the result page for theQuery and keep it on the instance.

        curl's stderr is merged into the captured output, exactly as before.
        subprocess.check_output raises CalledProcessError if curl exits
        with a non-zero status.
        """
        url = self.GOOGLE_URL.format(urllib.quote(theQuery))
        self._myGooglePage = subprocess.check_output(
            ["curl", "-L", "-A", self.AGENT_ID, url],
            stderr=subprocess.STDOUT)

    def getImages(self):
        """Print the image URL and caption of each result cell; return a URL.

        Returns the URL found in the last cell that contains one, or None
        when the page has no ``images_table`` or no cell carries an
        ``imgurl`` parameter (both cases used to raise).

        NOTE(review): the published source lost its indentation; the
        return is assumed to follow the loop -- confirm.
        """
        soup = BeautifulSoup(self._myGooglePage)
        table = soup.find("table", {"class": "images_table"})
        if table is None:
            return None
        lastImage = None
        for cell in table.findAll("td"):
            found = self._IMGURL_RE.findall(str(cell))
            if not found:
                # Cell without an image link: nothing to report.
                continue
            lastImage = found[0]
            # print(...) with a single argument emits the same text under
            # Python 2 and Python 3; print("") is the blank separator line.
            print("myImage!")
            print(lastImage)
            print("")
            # The caption is the cell content minus links and line breaks.
            caption = re.sub('<a .*?</a>', '', str(cell.renderContents()))
            caption = re.sub('<br />', '', caption)
            print("myCaption!")
            print(caption)
            print("")
        return lastImage

    def getImages2(self):
        """Return every ``imgurl`` value found in the raw page, in page order."""
        return self._IMGURL_RE.findall(self._myGooglePage)

    # not working
    def getStats(self):
        """Return the ``subform_ctrl`` div of the page, or "" when absent."""
        soup = BeautifulSoup(self._myGooglePage)
        stats = soup.find("div", {"id": "subform_ctrl"})
        if stats is None:
            return ""
        return stats

    def getFirstResult(self):
        """Return the href of the first ``h3.r`` headline link.

        Returns None when the page has no such headline or the headline
        holds no link (this used to raise).
        """
        soup = BeautifulSoup(self._myGooglePage)
        headline = soup.find("h3", {"class": "r"})
        if headline is None:
            return None
        link = headline.find("a")
        if link is None:
            return None
        return link["href"]
Latest revision as of 18:09, 20 February 2012
Google Search scraper
#!/home/slorusso/ENV/bin/python
# coding: utf-8
import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup
class GoogleSearchResults:
    """Fetch a Google web-search result page with curl and re-emit parts of it.

    Call scrape() first; the other methods parse the page it stored in
    ``_myGooglePage``.
    """

    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    GOOGLE_URL = "http://www.google.com/search?hl=en&q={0}&btnG=Google+Search"
    _myGooglePage = ""

    # Raw-string patterns, compiled once (the originals relied on invalid
    # string escapes such as "\=" and were re-parsed on every call).
    _ONMOUSEDOWN_RE = re.compile(r'onmousedown=".*?"')
    _CLASS_ATTR_RE = re.compile(r'class=".*?"')
    _VSHID_SPAN_RE = re.compile(r'<span class="vshid".*?</span>')
    _H3_RE = re.compile(r'<h3 .*?</h3>')
    _TABLE_RE = re.compile(r'<table .*?</table>')
    _PARENS_RE = re.compile(r'\(.*?\)')

    def scrape(self, theQuery):
        """Download the result page for theQuery and keep it on the instance.

        curl runs with -sS so that its progress meter, which it writes to
        stderr, is not mixed into the captured page; real error messages
        are still shown. subprocess.check_output raises CalledProcessError
        if curl exits with a non-zero status.
        """
        url = self.GOOGLE_URL.format(urllib.quote(theQuery))
        self._myGooglePage = subprocess.check_output(
            ["curl", "-sS", "-L", "-A", self.AGENT_ID, url],
            stderr=subprocess.STDOUT)

    def getResults(self):
        """Return the results (``li.g`` items) as a cleaned-up HTML string.

        For each result the headline, the details block and any nested
        sub-results are appended with their ``onmousedown`` handlers
        removed, followed by a ``<br/>`` separator. Missing pieces are
        skipped instead of being emitted as the text "None".
        """
        soup = BeautifulSoup(self._myGooglePage)
        parts = []
        for item in soup.findAll("li", {"class": "g"}):
            headlineTag = item.find("h3", {"class": "r"})
            if headlineTag is not None:
                headline = self._ONMOUSEDOWN_RE.sub('', str(headlineTag))
                headline = self._CLASS_ATTR_RE.sub('', headline)
                # news exception
                # NOTE(review): indentation was lost in the published
                # source; the check is assumed to guard the headline only.
                if "News for" not in headline:
                    parts.append(headline)

            details = item.find("div", {"class": "s"})
            if details is not None:
                detailsHtml = self._ONMOUSEDOWN_RE.sub('', str(details))
                detailsHtml = self._VSHID_SPAN_RE.sub('', detailsHtml)
                # youtube exception (pay attention to news)
                detailsHtml = self._H3_RE.sub('', detailsHtml)
                cells = details.findAll("td")
                try:
                    youtubeText = str(cells[2].div.text)
                except (IndexError, AttributeError, UnicodeError):
                    # Fewer than three cells, no div in the third cell, or
                    # text that cannot be converted: keep the table as is.
                    pass
                else:
                    # A function replacement inserts the text literally;
                    # a string would have its backslashes read as escapes.
                    detailsHtml = self._TABLE_RE.sub(
                        lambda _match: youtubeText, detailsHtml)
                parts.append(detailsHtml)

            subTable = item.find("table", {"class": "nrgt"})
            if subTable is not None:
                for sub in subTable.findAll("div", {"class": "sld vsc"}):
                    subHeadline = sub.find("h3", {"class": "r"})
                    if subHeadline is not None:
                        parts.append(
                            self._ONMOUSEDOWN_RE.sub('', str(subHeadline)))
                    subDetail = sub.find("div", {"class": "st"})
                    if subDetail is not None:
                        parts.append(
                            self._ONMOUSEDOWN_RE.sub('', str(subDetail)))

            parts.append("\n<br/>\n")
        return "".join(parts)

    def getResultsOld(self):
        """Older extraction: pair ``h3.r`` headlines with ``div.s`` details.

        Headlines and details are matched by position up to the shorter
        list. Returns a "did not match any documents" message when
        nothing was found.
        """
        soup = BeautifulSoup(self._myGooglePage)
        headlines = soup.findAll("h3", {"class": "r"})
        detailBlocks = soup.findAll("div", {"class": "s"})
        parts = []
        # zip stops at the shorter list, like the original min() of lengths.
        for headlineTag, detailTag in zip(headlines, detailBlocks):
            headline = str(headlineTag).replace('class="l"', "")
            parts.append(self._ONMOUSEDOWN_RE.sub("", headline))

            detail = BeautifulSoup(str(detailTag))
            # Each node is removed on its own: previously a missing
            # "vshid" span also skipped removal of the "esc slp" div.
            for tagName, cssClass in (("span", "vshid"), ("div", "esc slp")):
                node = detail.find(tagName, {"class": cssClass})
                if node is not None:
                    node.replaceWith("")
            parts.append(str(detail))
            parts.append("\n\n")

        if not parts:
            return "<span style='font-size:17px'>Your search did not match any documents.</span>"
        return "".join(parts)

    def getStats(self):
        """Return the ``resultStats`` text without parenthesised parts.

        Returns "" when the page has no ``resultStats`` div.
        """
        soup = BeautifulSoup(self._myGooglePage)
        stats = soup.find("div", {"id": "resultStats"})
        if stats is None:
            return ""
        return self._PARENS_RE.sub('', stats.text)

    def getFirstResult(self):
        """Return the href of the first ``h3.r`` headline link.

        Returns None when the page has no such headline or the headline
        holds no link (this used to raise).
        """
        soup = BeautifulSoup(self._myGooglePage)
        headline = soup.find("h3", {"class": "r"})
        if headline is None:
            return None
        link = headline.find("a")
        if link is None:
            return None
        return link["href"]
#myGSR = GoogleSearchResults()
#myGSR.scrape("silviolorusso")
#myGSR.getResults()
Google Image Search Scraper
#!/home/slorusso/ENV/bin/python
# coding: utf-8
import subprocess, re, urllib from BeautifulSoup import BeautifulSoup
class GoogleImageSearchResults:
    """Fetch a Google Images result page with curl and pull data out of it.

    Call scrapeImages() first; the other methods parse the page it stored
    in ``_myGooglePage``.
    """

    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    GOOGLE_URL = "https://www.google.com/images?q={0}&sout=1"
    _myGooglePage = ""

    # Compiled once instead of once per table cell.
    _IMGURL_RE = re.compile(r'imgurl=(?P<imgurl>[^&]+)&')

    def scrapeImages(self, theQuery):
        """Download the result page for theQuery and keep it on the instance.

        curl's stderr is merged into the captured output, exactly as before.
        subprocess.check_output raises CalledProcessError if curl exits
        with a non-zero status.
        """
        url = self.GOOGLE_URL.format(urllib.quote(theQuery))
        self._myGooglePage = subprocess.check_output(
            ["curl", "-L", "-A", self.AGENT_ID, url],
            stderr=subprocess.STDOUT)

    def getImages(self):
        """Print the image URL and caption of each result cell; return a URL.

        Returns the URL found in the last cell that contains one, or None
        when the page has no ``images_table`` or no cell carries an
        ``imgurl`` parameter (both cases used to raise).

        NOTE(review): the published source lost its indentation; the
        return is assumed to follow the loop -- confirm.
        """
        soup = BeautifulSoup(self._myGooglePage)
        table = soup.find("table", {"class": "images_table"})
        if table is None:
            return None
        lastImage = None
        for cell in table.findAll("td"):
            found = self._IMGURL_RE.findall(str(cell))
            if not found:
                # Cell without an image link: nothing to report.
                continue
            lastImage = found[0]
            # print(...) with a single argument emits the same text under
            # Python 2 and Python 3; print("") is the blank separator line.
            print("myImage!")
            print(lastImage)
            print("")
            # The caption is the cell content minus links and line breaks.
            # The '' and '<br />' literals were eaten by the wiki rendering
            # and are restored here.
            caption = re.sub('<a .*?</a>', '', str(cell.renderContents()))
            caption = re.sub('<br />', '', caption)
            print("myCaption!")
            print(caption)
            print("")
        return lastImage

    def getImages2(self):
        """Return every ``imgurl`` value found in the raw page, in page order."""
        return self._IMGURL_RE.findall(self._myGooglePage)

    # not working
    def getStats(self):
        """Return the ``subform_ctrl`` div of the page, or "" when absent."""
        soup = BeautifulSoup(self._myGooglePage)
        stats = soup.find("div", {"id": "subform_ctrl"})
        if stats is None:
            return ""
        return stats

    def getFirstResult(self):
        """Return the href of the first ``h3.r`` headline link.

        Returns None when the page has no such headline or the headline
        holds no link (this used to raise).
        """
        soup = BeautifulSoup(self._myGooglePage)
        headline = soup.find("h3", {"class": "r"})
        if headline is None:
            return None
        link = headline.find("a")
        if link is None:
            return None
        return link["href"]