User:Silviolorusso/thematic2/googlescrapers: Difference between revisions
(Created page with "== Google Search scraper == <source lang="python"> #!/home/slorusso/ENV/bin/python # coding: utf-8 import subprocess, re, urllib from BeautifulSoup import BeautifulSoup cla...") |
No edit summary |
||
Line 106: | Line 106: | ||
</source> | </source> | ||
== Google Image Search Scraper == |
Revision as of 18:09, 20 February 2012
Google Search scraper
#!/home/slorusso/ENV/bin/python
# coding: utf-8
import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup
class GoogleSearchResults :
    """Fetch a Google web-search results page and re-render parts of it as HTML.

    Call ``scrape(query)`` first; every ``get*`` method then parses the page
    stored by the most recent ``scrape`` call (an empty string until then).

    NOTE(review): the listing this class was recovered from had lost all of
    its indentation. The nesting below was reconstructed from the control
    flow and should be checked against the author's copy -- in particular how
    many statements in ``getResults`` belong to the "News for" test, and
    whether the ``<br/>`` separator is appended once per result.
    """

    AGENT_ID = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"
    GOOGLE_URL = "http://www.google.com/search?hl=en&q={0}&btnG=Google+Search"

    # Raw page source stored by the last scrape() call.
    _myGooglePage = ""

    # Markup fragments removed from (or replaced in) the re-rendered HTML.
    # Compiled once here instead of on every loop iteration.
    _ONMOUSEDOWN_RE = re.compile(r'onmousedown=".*?"')
    _CLASS_ATTR_RE = re.compile(r'class=".*?"')
    _VSHID_SPAN_RE = re.compile(r'<span class="vshid".*?</span>')
    _H3_RE = re.compile(r'<h3 .*?</h3>')
    _TABLE_RE = re.compile(r'<table .*?</table>')
    _PARENS_RE = re.compile(r'\(.*?\)')

    @staticmethod
    def _toNativeStr(theText) :
        """Return theText as the interpreter's native ``str`` (UTF-8 assumed).

        Under Python 2 this turns ``unicode`` into UTF-8 bytes, so it can be
        mixed with the byte strings produced by ``str(tag)`` and passed to
        ``urllib.quote`` without implicit ASCII encoding errors.
        """
        if isinstance(theText, str) :
            return theText
        if isinstance(theText, bytes) :
            return theText.decode("utf-8")
        return theText.encode("utf-8")

    @staticmethod
    def _tagToHtml(theTag, *thePatterns) :
        """Render theTag with ``str()`` and delete every match of thePatterns.

        Returns "" when theTag is None, so an element that is missing from
        the page never shows up as the literal text "None" in the output.
        """
        if theTag is None :
            return ""
        myHtml = str(theTag)
        for myPattern in thePatterns :
            myHtml = myPattern.sub("", myHtml)
        return myHtml

    @classmethod
    def _replaceTables(cls, theHtml, theContent) :
        """Replace every ``<table ...>...</table>`` in theHtml by theContent.

        A function is used as the replacement so that backslashes or group
        references inside the scraped text are inserted literally instead of
        being interpreted as a substitution template.
        """
        return cls._TABLE_RE.sub(lambda theMatch : theContent, theHtml)

    def _detailsToHtml(self, theDetails) :
        """Render a result's ``div.s`` details element as cleaned-up HTML."""
        myHtml = self._tagToHtml(theDetails, self._ONMOUSEDOWN_RE,
                                 self._VSHID_SPAN_RE, self._H3_RE)
        # youtube exception (pay attention to news): when the details hold a
        # table with at least three cells, the table is replaced by the text
        # of the <div> inside the third cell.
        myCells = theDetails.findAll("td")
        try :
            myContent = myCells[2].div.text
        except (IndexError, AttributeError) :
            # Fewer than three cells, or no <div> in the third one:
            # keep the details as they are.
            return myHtml
        return self._replaceTables(myHtml, self._toNativeStr(myContent))

    def scrape(self, theQuery) :
        """Download the results page for theQuery with curl and store it.

        Raises subprocess.CalledProcessError if curl exits with a non-zero
        status, and OSError if curl cannot be started.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(self._toNativeStr(theQuery)))
        # stderr is merged into the captured output, so curl is run with -sS:
        # its progress meter is suppressed (it would otherwise end up inside
        # the stored page) while real error messages are still reported.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-sS", "-L", "-A", self.AGENT_ID, myUrl],
            stderr=subprocess.STDOUT)

    def getResults(self) :
        """Return the results of the stored page as one HTML string.

        For every ``li.g`` element: its ``h3.r`` headline (unless it contains
        "News for"), its ``div.s`` details, and the headline/snippet of every
        ``div`` with class "sld vsc" inside a ``table.nrgt``, followed by a
        ``<br/>`` separator. Returns "" when the page has no ``li.g`` element.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myParts = []
        for myResult in mySoup.findAll("li", {"class" : "g"}) :
            myHeadline = self._tagToHtml(myResult.find("h3", {"class" : "r"}),
                                         self._ONMOUSEDOWN_RE, self._CLASS_ATTR_RE)
            # news exception
            # NOTE(review): only the headline is dropped for "News for"
            # blocks here; confirm against the original indentation.
            if "News for" not in myHeadline :
                myParts.append(myHeadline)

            myDetails = myResult.find("div", {"class" : "s"})
            if myDetails is not None :
                myParts.append(self._detailsToHtml(myDetails))

            myTable = myResult.find("table", {"class" : "nrgt"})
            if myTable is not None :
                for mySubResult in myTable.findAll("div", {"class" : "sld vsc"}) :
                    myParts.append(self._tagToHtml(
                        mySubResult.find("h3", {"class" : "r"}), self._ONMOUSEDOWN_RE))
                    myParts.append(self._tagToHtml(
                        mySubResult.find("div", {"class" : "st"}), self._ONMOUSEDOWN_RE))

            myParts.append("\n<br/>\n")
        return "".join(myParts)

    def getResultsOld(self) :
        """Older renderer: pair page-wide headlines and details by position.

        The n-th ``h3.r`` is paired with the n-th ``div.s``; surplus elements
        of the longer list are ignored. Returns a "did not match any
        documents" message when no pair exists.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadlines = mySoup.findAll("h3", {"class" : "r"})
        myDetails = mySoup.findAll("div", {"class" : "s"})
        myParts = []
        for myHeadlineTag, myDetailTag in zip(myHeadlines, myDetails) :
            myHeadline = str(myHeadlineTag).replace('class="l"', "")
            myParts.append(self._ONMOUSEDOWN_RE.sub("", myHeadline))

            # Work on a re-parsed copy so the removals below do not modify
            # the tree held in mySoup.
            myDetail = BeautifulSoup(str(myDetailTag))
            # Each unwanted element is removed independently: a missing span
            # must not prevent the div from being removed.
            for myName, myClass in (("span", "vshid"), ("div", "esc slp")) :
                myUnwanted = myDetail.find(myName, {"class" : myClass})
                if myUnwanted is not None :
                    myUnwanted.replaceWith("")
            myParts.append(str(myDetail))
            myParts.append("\n\n")
        if not myParts :
            return "<span style='font-size:17px'>Your search did not match any documents.</span>"
        return "".join(myParts)

    def getStats(self) :
        """Return the text of ``div#resultStats`` with parenthesised parts removed.

        Returns "" when the page has no such element.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id" : "resultStats"})
        if myResultStats is None :
            return ""
        return self._PARENS_RE.sub("", myResultStats.text)

    def getFirstResult(self) :
        """Return the ``href`` of the first link inside the first ``h3.r``.

        Raises AttributeError when the page has no ``h3.r`` element and
        TypeError when that headline contains no ``<a>``; callers that may
        run a query without results have to handle these.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class" : "r"})
        myA = myHeadline.find("a")
        return myA["href"]
#myGSR = GoogleSearchResults()
#myGSR.scrape("silviolorusso")
#myGSR.getResults()