User:Silviolorusso/thematic2/googlescrapers: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "== Google Search scraper == <source lang="python"> #!/home/slorusso/ENV/bin/python # coding: utf-8 import subprocess, re, urllib from BeautifulSoup import BeautifulSoup cla...")
 
No edit summary
Line 106: Line 106:


</source>
</source>
== Google Image Search Scraper ==

Revision as of 18:09, 20 February 2012

Google Search scraper

#!/home/slorusso/ENV/bin/python
# coding: utf-8

import subprocess, re, urllib
from BeautifulSoup import BeautifulSoup

class GoogleSearchResults:
    """Download a Google web-search page with curl and re-render its hits as HTML.

    Call scrape() first; every other method parses the page stored by the
    most recent scrape() call (an empty string until then).
    """

    # User-Agent string handed to curl through its -A option.
    AGENT_ID   = "Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

    # Search URL template; {0} is replaced by the URL-quoted query.
    GOOGLE_URL = "http://www.google.com/search?hl=en&q={0}&btnG=Google+Search"

    # Raw output of the last curl call ("" until scrape() has been called).
    _myGooglePage = ""

    # Text appended after every result block produced by getResults().
    _RESULT_SEPARATOR = "\n            <br/>\n            "

    # Markup that gets stripped from (or replaced in) the copied result HTML.
    # Raw strings avoid the invalid "\=" escapes of plain string literals.
    _ONMOUSEDOWN_RE = re.compile(r'onmousedown=".*?"')
    _CLASS_ATTR_RE  = re.compile(r'class=".*?"')
    _VSHID_SPAN_RE  = re.compile(r'<span class="vshid".*?</span>')
    _H3_RE          = re.compile(r'<h3 .*?</h3>')
    _TABLE_RE       = re.compile(r'<table .*?</table>')
    _PARENS_RE      = re.compile(r'\(.*?\)')

    def scrape(self, theQuery):
        """Fetch the result page for theQuery and keep it for the parse methods.

        Raises subprocess.CalledProcessError when curl exits with a non-zero
        status.
        """
        myUrl = self.GOOGLE_URL.format(urllib.quote(theQuery))
        # -s silences the progress meter, which would otherwise be captured
        # together with the page because stderr is merged into stdout;
        # -S keeps curl's error message, so a failure is still reported in
        # the CalledProcessError output.
        self._myGooglePage = subprocess.check_output(
            ["curl", "-s", "-S", "-L", "-A", self.AGENT_ID, myUrl],
            stderr=subprocess.STDOUT)

    def getResults(self):
        """Return the HTML of every <li class="g"> hit on the scraped page.

        Each hit contributes its cleaned headline, its cleaned details and
        any nested sub-results, followed by a <br/> separator. Returns ""
        when the page holds no hits.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myParts = []
        for myResult in mySoup.findAll("li", {"class" : "g"}):
            myParts.append(self._renderHeadline(myResult))
            myParts.append(self._renderDetails(myResult))
            myParts.append(self._renderSubResults(myResult))
            myParts.append(self._RESULT_SEPARATOR)
        return "".join(myParts)

    def _renderHeadline(self, theResult):
        """Return the hit's <h3 class="r"> without onmousedown/class attributes."""
        myTag = theResult.find("h3", {"class" : "r"})
        if myTag is None:
            # No headline: emit nothing rather than the text "None".
            return ""
        myHeadline = self._ONMOUSEDOWN_RE.sub("", str(myTag))
        myHeadline = self._CLASS_ATTR_RE.sub("", myHeadline)
        # news exception: "News for ..." headlines are left out
        if "News for" in myHeadline:
            return ""
        return myHeadline

    def _renderDetails(self, theResult):
        """Return the hit's <div class="s"> cleaned up, or "" if it has none."""
        myDetails = theResult.find("div", {"class" : "s"})
        if myDetails is None:
            return ""
        myDetailsStr = self._ONMOUSEDOWN_RE.sub("", str(myDetails))
        myDetailsStr = self._VSHID_SPAN_RE.sub("", myDetailsStr)
        # youtube exception (pay attention to news): drop nested headlines
        # and swap the layout table for the text of its third cell.
        myDetailsStr = self._H3_RE.sub("", myDetailsStr)
        myCells = myDetails.findAll("td")
        try:
            youtubeContent = str(myCells[2].div.text)
        except (IndexError, AttributeError, UnicodeError):
            # Fewer than three cells, no <div> in the third one, or text
            # that str() cannot convert: keep the details as they are.
            return myDetailsStr
        # A function replacement inserts the text literally; a plain string
        # would be read as a template in which backslashes are escapes.
        return self._TABLE_RE.sub(lambda theMatch: youtubeContent, myDetailsStr)

    def _renderSubResults(self, theResult):
        """Return headline and detail HTML of the hit's nested sub-results."""
        myTable = theResult.find("table", {"class" : "nrgt"})
        if myTable is None:
            return ""
        myParts = []
        for mySubResult in myTable.findAll("div", {"class" : "sld vsc"}):
            mySubTags = (mySubResult.find("h3", {"class" : "r"}),
                         mySubResult.find("div", {"class" : "st"}))
            for myTag in mySubTags:
                # Skip missing parts instead of emitting the text "None".
                if myTag is not None:
                    myParts.append(self._ONMOUSEDOWN_RE.sub("", str(myTag)))
        return "".join(myParts)

    def getResultsOld(self):
        """Return headline/detail pairs of the scraped page as HTML (older renderer).

        The i-th <h3 class="r"> is paired with the i-th <div class="s">;
        surplus elements of the longer list are ignored. Returns a
        "did not match any documents" message when nothing is found.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadlines = mySoup.findAll("h3", {"class" : "r"})
        myDetails = mySoup.findAll("div", {"class" : "s"})
        myParts = []
        # zip() stops at the shorter list, like min(len(), len()) did.
        for myHeadlineTag, myDetailTag in zip(myHeadlines, myDetails):
            myHeadline = str(myHeadlineTag).replace('class="l"', "")
            myParts.append(self._ONMOUSEDOWN_RE.sub("", myHeadline))
            # Re-parse the detail so elements can be removed from a copy.
            myDetail = BeautifulSoup(str(myDetailTag))
            # Remove each unwanted element on its own: a missing span must
            # not prevent the div from being removed.
            for myName, myClass in (("span", "vshid"), ("div", "esc slp")):
                myUnwanted = myDetail.find(myName, {"class" : myClass})
                if myUnwanted is not None:
                    myUnwanted.replaceWith("")
            myParts.append(str(myDetail))
            myParts.append("\n\n")
        myHtml = "".join(myParts)
        if myHtml == "":
            myHtml = "<span style='font-size:17px'>Your search did not match any documents.</span>"
        return myHtml

    def getStats(self):
        """Return the text of <div id="resultStats"> minus any "(...)" parts.

        Returns "" when the page has no such element.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myResultStats = mySoup.find("div", {"id" : "resultStats"})
        if myResultStats is None:
            return ""
        return self._PARENS_RE.sub("", myResultStats.text)

    def getFirstResult(self):
        """Return the href of the link inside the first <h3 class="r">.

        Raises AttributeError when the page has no such headline and
        TypeError when the headline contains no <a> element.
        """
        mySoup = BeautifulSoup(self._myGooglePage)
        myHeadline = mySoup.find("h3", {"class" : "r"})
        myA = myHeadline.find("a")
        return myA["href"]
        
#myGSR = GoogleSearchResults() 
#myGSR.scrape("silviolorusso")
#myGSR.getResults()

Google Image Search Scraper