User:Silviolorusso/thematic2/googlescrapers

From XPUB & Lens-Based wiki
< User:Silviolorusso
Revision as of 18:09, 20 February 2012 by Silviolorusso (talk | contribs) (→‎Google Image Search Scraper)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Google Search scraper

#!/home/slorusso/ENV/bin/python
# coding: utf-8

import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup

class GoogleSearchResults:
    """Download a Google web-search page with curl and extract HTML from it.

    Call scrape() first; every other method parses the page captured by
    the most recent scrape() call.
    """

    # User-Agent string handed to curl through its -A option.
    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    # Search URL template; {0} is replaced by the URL-quoted query.
    GOOGLE_URL = "http://www.google.com/search?hl=en&q={0}&btnG=Google+Search"

    # Text appended after every result by getResults().
    _RESULT_SEPARATOR = "\n" + " " * 12 + "<br/>\n" + " " * 12

    # Output captured by the last scrape() call; empty until scrape() runs.
    _myGooglePage = ""

    @staticmethod
    def _stripOnMouseDown(theHtml):
        """Return theHtml with every onmousedown="..." attribute removed."""
        return re.sub(r'onmousedown=".*?"', '', theHtml)

    @staticmethod
    def _replaceTables(theHtml, theText):
        """Return theHtml with each <table ...>...</table> span replaced by theText.

        A function is used as the replacement so that theText is inserted
        literally: backslashes or group references inside scraped text are
        not interpreted as a replacement template.
        """
        return re.sub(r'<table .*?</table>', lambda theMatch: theText, theHtml)

    def scrape(self, theQuery):
        """Fetch the result page for theQuery and store it on the instance.

        Runs curl as a subprocess; subprocess.check_output raises
        CalledProcessError when curl exits with a non-zero status.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(theQuery))
        # NOTE(review): stderr is merged into the captured output, so anything
        # curl writes to stderr ends up inside the stored page - confirm intended.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-L", "-A", self.AGENT_ID, myUrl],
            stderr=subprocess.STDOUT)

    def getResults(self):
        """Return the results of the stored page as one HTML string.

        For every <li class="g"> the headline (<h3 class="r">), the details
        (<div class="s">) and any sub-results found in a <table class="nrgt">
        are appended, followed by a <br/> separator.  Elements that are
        missing contribute nothing to the output.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myParts = []
        for myResult in mySoup.findAll("li", {"class": "g"}):
            myHeadlineTag = myResult.find("h3", {"class": "r"})
            if myHeadlineTag is not None:
                myHeadline = self._stripOnMouseDown(str(myHeadlineTag))
                myHeadline = re.sub(r'class=".*?"', '', myHeadline)
                # news exception: the headline is dropped, the details below are kept
                if "News for" not in myHeadline:
                    myParts.append(myHeadline)

            myDetails = myResult.find("div", {"class": "s"})
            if myDetails is not None:
                myDetailsStr = self._stripOnMouseDown(str(myDetails))
                myDetailsStr = re.sub(r'<span class="vshid".*?</span>', '', myDetailsStr)
                # youtube exception (pay attention to news)
                myDetailsStr = re.sub(r'<h3 .*?</h3>', '', myDetailsStr)
                youtubeTable = myDetails.findAll("td")
                try:
                    youtubeContent = str(youtubeTable[2].div.text)
                except Exception:
                    # Best effort: there is no third cell, it has no <div>, or
                    # its text cannot be converted - keep the details as they are.
                    pass
                else:
                    myDetailsStr = self._replaceTables(myDetailsStr, youtubeContent)
                myParts.append(myDetailsStr)

            myTable = myResult.find("table", {"class": "nrgt"})
            if myTable is not None:
                for mySubResult in myTable.findAll("div", {"class": "sld vsc"}):
                    mySubTags = (mySubResult.find("h3", {"class": "r"}),
                                 mySubResult.find("div", {"class": "st"}))
                    for mySubTag in mySubTags:
                        if mySubTag is not None:
                            myParts.append(self._stripOnMouseDown(str(mySubTag)))

            myParts.append(self._RESULT_SEPARATOR)
        return "".join(myParts)

    def getResultsOld(self):
        """Return headline/detail pairs of the stored page as one HTML string.

        Pairs the n-th <h3 class="r"> with the n-th <div class="s">; surplus
        elements of the longer list are ignored.  Returns a "did not match
        any documents" message when nothing was found.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadlines = mySoup.findAll("h3", {"class": "r"})
        myDetails = mySoup.findAll("div", {"class": "s"})
        myParts = []
        for myHeadlineTag, myDetailTag in zip(myHeadlines, myDetails):
            myHeadline = str(myHeadlineTag).replace('class="l"', "")
            myParts.append(self._stripOnMouseDown(myHeadline))

            # Re-parse the detail so elements can be removed from a private copy.
            myDetail = BeautifulSoup(str(myDetailTag))
            # Each unwanted element is removed independently, so a missing
            # span no longer prevents the div from being removed.
            for myName, myClass in (("span", "vshid"), ("div", "esc slp")):
                myExtra = myDetail.find(myName, {"class": myClass})
                if myExtra is not None:
                    myExtra.replaceWith("")
            myParts.append(str(myDetail))
            myParts.append("\n\n")

        myHtml = "".join(myParts)
        if myHtml == "":
            myHtml = "<span style='font-size:17px'>Your search did not match any documents.</span>"
        return myHtml

    def getStats(self):
        """Return the text of <div id="resultStats"> without parenthesised parts.

        Returns "" when the stored page has no such element.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id": "resultStats"})
        if myResultStats is None:
            return ""
        return re.sub(r'\(.*?\)', '', myResultStats.text)

    def getFirstResult(self):
        """Return the href of the link inside the first <h3 class="r">.

        Returns "" when the stored page has no such headline or the headline
        contains no link (for example before scrape() has been called).
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class": "r"})
        if myHeadline is None:
            return ""
        myA = myHeadline.find("a")
        if myA is None:
            return ""
        return myA["href"]
        
#myGSR = GoogleSearchResults() 
#myGSR.scrape("silviolorusso")
#myGSR.getResults()

Google Image Search Scraper

#!/home/slorusso/ENV/bin/python
# coding: utf-8

import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup

class GoogleImageSearchResults:
    """Download a Google image-search page with curl and extract data from it.

    Call scrapeImages() first; every other method parses the page captured
    by the most recent scrapeImages() call.

    NOTE(review): this class was reconstructed from a wiki rendering that
    had stripped empty-string literals and collapsed several statements
    onto one line; the spots where the original text could not be
    recovered with certainty are marked below.
    """

    # User-Agent string handed to curl through its -A option.
    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    # Image-search URL template; {0} is replaced by the URL-quoted query.
    GOOGLE_URL = "https://www.google.com/images?q={0}&sout=1"

    # Captures the value of an imgurl=...& query parameter.
    _IMGURL_PATTERN = re.compile(r'imgurl=(?P<imgurl>[^&]+)&')

    # Output captured by the last scrapeImages() call; empty until it runs.
    _myGooglePage = ""

    def scrapeImages(self, theQuery):
        """Fetch the image result page for theQuery and store it on the instance.

        Runs curl as a subprocess; subprocess.check_output raises
        CalledProcessError when curl exits with a non-zero status.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(theQuery))
        # NOTE(review): stderr is merged into the captured output, so anything
        # curl writes to stderr ends up inside the stored page - confirm intended.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-L", "-A", self.AGENT_ID, myUrl],
            stderr=subprocess.STDOUT)

    def getImages(self):
        """Print the image URL and caption of every result cell.

        Walks the <td> cells of <table class="images_table">, printing the
        first imgurl value of each cell and the cell content with its links
        removed.  Returns the image URL of the last cell.

        Raises AttributeError when the table is missing, IndexError when a
        cell holds no imgurl value, and UnboundLocalError when the table has
        no cells.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myResults = mySoup.find("table", {"class": "images_table"}).findAll("td")
        for myCell in myResults:
            myImage = self._IMGURL_PATTERN.findall(str(myCell))[0]
            # print(...) with a single argument behaves the same as the
            # original print statements and is valid in newer Pythons too.
            print("myImage!")
            print(myImage)
            print("")
            myCaption = re.sub(r'<a .*?</a>', '', str(myCell.renderContents()))
            # NOTE(review): the wiki rendered the original pattern as a line
            # break, so it was some spelling of <br>; all spellings are removed.
            myCaption = re.sub(r'<br\s*/?>', '', myCaption)
            print("myCaption!")
            print(myCaption)
            print("")
        # NOTE(review): the original indentation of this return was lost; it is
        # assumed to follow the loop - confirm against the page source.
        return myImage

    def getImages2(self):
        """Return every imgurl value found in the stored page, in page order."""
        return self._IMGURL_PATTERN.findall(self._myGooglePage)

    # not working
    def getStats(self):
        """Return the <div id="subform_ctrl"> element, or "" when it is absent."""
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id": "subform_ctrl"})
        if myResultStats is None:
            return ""
        return myResultStats

    def getFirstResult(self):
        """Return the href of the link inside the first <h3 class="r">.

        Returns "" when the stored page has no such headline or the headline
        contains no link.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class": "r"})
        if myHeadline is None:
            return ""
        myA = myHeadline.find("a")
        if myA is None:
            return ""
        return myA["href"]