User:Eleanorg/2.1/Beautiful Soup scraping: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
Line 43: Line 43:


for i in range(1,5):
for i in range(1,5):
         print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the tit$
         print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the title tags
         print linkSoup[i].get('href')  # uses the BS .get() method to get the href. You can also use .ge$
         print linkSoup[i].get('href')  # uses the BS .get() method to get the href.
         print "\n"
         print "\n"
print """
print """

Revision as of 16:16, 25 October 2012

More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other fun things.


Scraping & saving to a text file

#!/usr/bin/python
#-*- coding:utf-8 -*-

# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup



from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re


# Fetch the HuffPo RSS feed as one raw string.
# NOTE(review): Python 2 API — in Python 3 this is urllib.request.urlopen.
webpage = urlopen('http://feeds.huffingtonpost.com/huffingtonpost/raw_feed').read()

# Parse the feed with BeautifulSoup 3 and collect every <title> and <link>
# tag into parallel lists (index i of one corresponds to index i of the other,
# assuming the feed emits them pairwise per item — TODO confirm for this feed).
soup = BeautifulSoup(webpage)
titleSoup = soup.findAll('title')      # this creates a list of all the titles
linkSoup = soup.findAll('link')        # this creates a list of all the links

# Reproduce the titles & links found, as a CGI response on our own page.
# The Content-Type header plus a blank line is the minimal CGI HTTP header.

print "Content-Type: text/html"
print
print """
<!DOCTYPE html>
<html>
  <head>
    <title></title>
    <style type="text/css">
        img { position: absolute; top:100px; left:100px; opacity: 0.3; width:500px; }
  </style>

  </head>

<body>"""

# NOTE(review): range(1,5) emits items 1-4 only — index 0 (presumably the
# feed's own channel-level <title>/<link>) is skipped; confirm this is intended.
for i in range(1,5):
        print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the title tags
        print linkSoup[i].get('href')   # uses the BS .get() method to get the href.
        print "\n"
print """
</body>
</html>"""

# now let's write those values to a text file:
# A `with` block guarantees the file is closed even if a write raises
# (the original opened and closed the handle manually, leaking it on error).
with open("scraped.txt", "w") as f:
    for i in range(1, 5):  # same slice as the HTML output above: items 1-4
        f.write(str(titleSoup[i].contents[0]))
        f.write("\n\n")
        # Uncomment to also save each item's link.
        # (Fixed: the original commented-out line was missing a closing paren.)
        # f.write(str(linkSoup[i].get('href')))