User:Silviolorusso/thematic2/googlescrapers

From XPUB & Lens-Based wiki
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Google Search Scraper

#!/home/slorusso/ENV/bin/python
# coding: utf-8

import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup

class GoogleSearchResults :
    """Fetch a Google web-search result page with curl and re-emit its results as HTML.

    Call scrape() first to download a page; every other method parses the
    page that was fetched last.  The code relies on urllib.quote and on the
    BeautifulSoup class imported at the top of the file.
    """

    # User-Agent header sent with every request.
    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    # Search URL template; {0} receives the URL-quoted query.
    GOOGLE_URL = "http://www.google.com/search?hl=en&q={0}&btnG=Google+Search"

    # Raw HTML of the page fetched by the last scrape() call; empty until then.
    _myGooglePage = ""

    # Text appended after every result block by getResults().
    _RESULT_SEPARATOR = "\n            <br/>\n            "

    # Patterns are compiled once; raw strings avoid invalid-escape warnings.
    _ONMOUSEDOWN_RE = re.compile(r'onmousedown=".*?"')
    _CLASS_ATTR_RE  = re.compile(r'class=".*?"')
    _VSHID_SPAN_RE  = re.compile(r'<span class="vshid".*?</span>')
    _H3_RE          = re.compile(r'<h3 .*?</h3>')
    _TABLE_RE       = re.compile(r'<table .*?</table>')
    _PARENS_RE      = re.compile(r'\(.*?\)')

    @staticmethod
    def _stripOnMouseDown(theHtml) :
        """Return theHtml with every onmousedown="..." attribute removed."""
        return GoogleSearchResults._ONMOUSEDOWN_RE.sub('', theHtml)

    def scrape(self, theQuery) :
        """Download the result page for theQuery and store it for the parsing methods.

        Raises subprocess.CalledProcessError when curl exits with a non-zero status.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(theQuery))
        # -s silences curl's progress meter, -S still reports real errors.
        # stderr is intentionally not merged into stdout: anything curl writes
        # there would otherwise end up inside the stored page.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-s", "-S", "-L", "-A", self.AGENT_ID, myUrl])

    def getResults(self) :
        """Return the results of the stored page as one HTML string.

        For every <li class="g"> the cleaned headline, the cleaned details
        block and any nested sub-results are emitted, followed by a <br/>
        separator.  Parts that are missing from a result are skipped.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myParts = []
        for myResult in mySoup.findAll("li", {"class" : "g"}) :
            myParts.append(self._renderHeadline(myResult))
            myParts.append(self._renderDetails(myResult))
            myParts.append(self._renderSubResults(myResult))
            myParts.append(self._RESULT_SEPARATOR)
        return "".join(myParts)

    def _renderHeadline(self, theResult) :
        """Return the <h3 class="r"> of theResult without onmousedown/class attributes."""
        myTag = theResult.find("h3", {"class" : "r"})
        if myTag is None :
            # str(None) would put the literal text "None" into the output.
            return ""
        myHeadline = self._stripOnMouseDown(str(myTag))
        myHeadline = self._CLASS_ATTR_RE.sub('', myHeadline)
        # news exception: the "News for ..." headline is left out
        if "News for" in myHeadline :
            return ""
        return myHeadline

    def _renderDetails(self, theResult) :
        """Return the cleaned <div class="s"> of theResult, or "" when there is none."""
        myDetails = theResult.find("div", {"class" : "s"})
        if myDetails is None :
            return ""
        myDetailsStr = self._stripOnMouseDown(str(myDetails))
        myDetailsStr = self._VSHID_SPAN_RE.sub('', myDetailsStr)
        # headlines nested inside the details block are dropped
        myDetailsStr = self._H3_RE.sub('', myDetailsStr)
        # youtube exception: a table inside the details is replaced by the
        # text of the <div> in its third cell
        myCells = myDetails.findAll("td")
        try :
            myVideoText = myCells[2].div.text
        except (IndexError, AttributeError) :
            # fewer than three cells, or no <div> in the third one
            return myDetailsStr
        if not isinstance(myVideoText, str) :
            # NOTE(review): assumes the text is a Python 2 unicode object and
            # that the surrounding markup is UTF-8 encoded - confirm.
            myVideoText = myVideoText.encode("utf-8")
        # A function replacement keeps backslashes in the text literal.
        return self._TABLE_RE.sub(lambda theMatch : myVideoText, myDetailsStr)

    def _renderSubResults(self, theResult) :
        """Return headline and detail of every sub-result in the <table class="nrgt">."""
        myTable = theResult.find("table", {"class" : "nrgt"})
        if myTable is None :
            return ""
        myParts = []
        for mySubResult in myTable.findAll("div", {"class" : "sld vsc"}) :
            mySubTags = (mySubResult.find("h3", {"class" : "r"}),
                         mySubResult.find("div", {"class" : "st"}))
            for myTag in mySubTags :
                if myTag is not None :
                    myParts.append(self._stripOnMouseDown(str(myTag)))
        return "".join(myParts)

    def getResultsOld(self) :
        """Older variant of getResults(): pair headlines and details by position.

        Returns the HTML of all pairs, or a "did not match any documents"
        message when the page contains no pair at all.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadlines = mySoup.findAll("h3", {"class" : "r"})
        myDetails = mySoup.findAll("div", {"class" : "s"})
        myParts = []
        # zip() stops at the shorter list, like the original min(len, len).
        for myHeadlineTag, myDetailTag in zip(myHeadlines, myDetails) :
            myHeadline = str(myHeadlineTag).replace('class="l"', "")
            myParts.append(self._stripOnMouseDown(myHeadline))
            # Re-parse the fragment so removals do not touch the page soup.
            myDetail = BeautifulSoup(str(myDetailTag))
            # Each node is removed independently, so a missing span no longer
            # prevents the div from being removed.
            for myName, myClass in (("span", "vshid"), ("div", "esc slp")) :
                myUnwanted = myDetail.find(myName, {"class" : myClass})
                if myUnwanted is not None :
                    myUnwanted.replaceWith("")
            myParts.append(str(myDetail))
            myParts.append("\n\n")
        if not myParts :
            return "<span style='font-size:17px'>Your search did not match any documents.</span>"
        return "".join(myParts)

    def getStats(self) :
        """Return the text of <div id="resultStats"> without parenthesised parts, or ""."""
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id" : "resultStats"})
        if myResultStats is None :
            return ""
        return self._PARENS_RE.sub('', myResultStats.text)

    def getFirstResult(self) :
        """Return the href of the first <h3 class="r"> link, or "" when there is none."""
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class" : "r"})
        if myHeadline is None :
            return ""
        myA = myHeadline.find("a")
        if myA is None :
            return ""
        return myA["href"]
        
#myGSR = GoogleSearchResults() 
#myGSR.scrape("silviolorusso")
#myGSR.getResults()

Google Image Search Scraper

#!/home/slorusso/ENV/bin/python
# coding: utf-8

import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup

class GoogleImageSearchResults :
    """Fetch a Google image-search result page with curl and extract image URLs.

    Call scrapeImages() first to download a page; the other methods parse the
    page that was fetched last.  The code relies on urllib.quote and on the
    BeautifulSoup class imported at the top of the script.

    NOTE(review): this class was reconstructed from a wiki export that had
    collapsed several lines and dropped the empty-string literals; the spots
    that had to be guessed are marked below.
    """

    # User-Agent header sent with every request.
    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    # Image-search URL template; {0} receives the URL-quoted query.
    GOOGLE_URL = "https://www.google.com/images?q={0}&sout=1"

    # Raw HTML of the page fetched by the last scrapeImages() call.
    _myGooglePage = ""

    # Captures the value of the imgurl= query parameter of a result link.
    _IMGURL_RE = re.compile(r'imgurl=(?P<imgurl>[^&]+)&')

    def scrapeImages(self, theQuery) :
        """Download the image result page for theQuery and store it.

        Raises subprocess.CalledProcessError when curl exits with a non-zero status.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(theQuery))
        # -s silences curl's progress meter, -S still reports real errors.
        # stderr is intentionally not merged into stdout: anything curl writes
        # there would otherwise end up inside the stored page.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-s", "-S", "-L", "-A", self.AGENT_ID, myUrl])

    def getImages(self) :
        """Print image URL and caption of every result cell (debug output).

        Returns the image URL of the last cell that had one, or None when the
        page has no <table class="images_table"> or no cell carries an imgurl.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myTable = mySoup.find("table", {"class" : "images_table"})
        if myTable is None :
            return None
        myImage = None
        for myCell in myTable.findAll("td") :
            myMatches = self._IMGURL_RE.findall(str(myCell))
            if not myMatches :
                # cell without an image link
                continue
            myImage = myMatches[0]
            print("myImage!")
            print(myImage)
            print("")
            # The caption is what remains of the cell once links are removed.
            myCaption = re.sub(r'<a .*?</a>', '', str(myCell.renderContents()))
            # NOTE(review): the export rendered this pattern as a line break;
            # '<br />' is assumed - confirm against the original script.
            myCaption = re.sub(r'<br />', '', myCaption)
            print("myCaption!")
            print(myCaption)
            print("")
        # NOTE(review): the export lost the indentation of this return; it is
        # assumed to follow the loop - confirm against the original script.
        return myImage

    def getImages2(self) :
        """Return every imgurl value found anywhere in the stored page, in order."""
        return self._IMGURL_RE.findall(self._myGooglePage)

    # not working
    def getStats(self) :
        """Return the <div id="subform_ctrl"> node of the stored page, or "" when absent."""
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id" : "subform_ctrl"})
        if myResultStats is None :
            return ""
        return myResultStats

    def getFirstResult(self) :
        """Return the href of the first <h3 class="r"> link, or "" when there is none."""
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class" : "r"})
        if myHeadline is None :
            return ""
        myA = myHeadline.find("a")
        if myA is None :
            return ""
        return myA["href"]