User:Eleanorg/2.1/Beautiful Soup scraping: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
 
(One intermediate revision by the same user not shown)
Line 43: Line 43:


for i in range(1,5):
for i in range(1,5):
         print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the tit$
         print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the title tags
         print linkSoup[i].get('href')  # uses the BS .get() method to get the href. You can also use .ge$
         print linkSoup[i].get('href')  # uses the BS .get() method to get the href.
         print "\n"
         print "\n"
print """
print """
Line 58: Line 58:
#      f.write(str(linkSoup[i].get('href'))
#      f.write(str(linkSoup[i].get('href'))
f.close()
f.close()
</source>
==Accepting input from the user==
This html form lets the user input a URL. The associated script attempts to scrape it for anything useful, and saves this text to a text file as well as printing it onscreen.
<source lang="html4strict">
<!DOCTYPE html>
<html>
  <head>
    <title>make a scraped pdf!</title>
        <link rel="stylesheet" href="">
        <link href='http://fonts.googleapis.com/css?family=Josefin+Sans' rel='stylesheet' type='text/css'>
  </head>
  <body>
Make a text file out of a scraped URL!
<form action="../cgi-bin/urllib/makeScrapedText.cgi" name="inputForm"> 
  URL: <input name="urlInput" class="">
  <br />
  <input type="submit" value="Submit">
</form>
</body>
</html>
</source>
<source lang="python">
#!/usr/bin/python
#-*- coding:utf-8 -*-
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup
import cgi
import cgitb; cgitb.enable()
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
# get the URL from the input form
form = cgi.FieldStorage()                      # Grabs whatever input comes from form
#TODO check if valid url/protocol given. urllib2 bit breaks if not.
url = form.getvalue("urlInput", "http://eleanorg.org")
#url = "http://eleanorg.org"
# grab the hufpo RSS feed
webpage = urlopen(url).read()
# parse it with Beautiful Soup to extract p tags
soup = BeautifulSoup(webpage)
pSoup = soup.findAll('p')      # this creates a list of all the titles
# print message to user in the browser
htmlHeader = """<!DOCTYPE html>
<html>
  <head>
    <title>A form talking to a python script</title>
  </head>
  <body>"""
htmlFooter = """
    </body>
</html>"""
print "Content-Type: text/html"
print
print htmlHeader
f = open("scraped.txt", "w")
for item in pSoup:
        howMany = len(item.contents)
        for i in range(0,howMany):
                text = item.contents[i].string
                if text:
                        print text
                        f.write(text)
                else:
                        print "Sorry, no paragraph content found at that URL."
        print "\n"
f.close()
print htmlFooter






</source>
</source>

Latest revision as of 15:39, 26 October 2012

More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other fun things.


Scraping & saving to a text file

#!/usr/bin/python
#-*- coding:utf-8 -*-

# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup



from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re


# grab the hufpo RSS feed
webpage = urlopen('http://feeds.huffingtonpost.com/huffingtonpost/raw_feed').read()

# parse it with Beautiful Soup to extract titles & links
soup = BeautifulSoup(webpage)
titleSoup = soup.findAll('title')      # this creates a list of all the titles
linkSoup = soup.findAll('link')        # this creates a list of all the links

# reproduce the titles & links found on our own page:

print "Content-Type: text/html"
print
print """
<!DOCTYPE html>
<html>
  <head>
    <title></title>
    <style type="text/css">
        img { position: absolute; top:100px; left:100px; opacity: 0.3; width:500px; }
  </style>

  </head>

<body>"""

for i in range(1,5):
        print titleSoup[i].contents[0]  # uses the BS .contents method to grab the content out of the title tags
        print linkSoup[i].get('href')   # uses the BS .get() method to get the href.
        print "\n"
print """
</body>
</html>"""

# now let's write those values to a text file:

f = open("scraped.txt", "w")
for i in range(1,5):
        f.write(str(titleSoup[i].contents[0]))
        f.write("\n\n")
#       f.write(str(linkSoup[i].get('href'))
f.close()

Accepting input from the user

This html form lets the user input a URL. The associated script attempts to scrape it for anything useful, and saves this text to a text file as well as printing it onscreen.

<!DOCTYPE html>
<html>
  <head>
    <title>make a scraped pdf!</title>
        <link rel="stylesheet" href="">
        <link href='http://fonts.googleapis.com/css?family=Josefin+Sans' rel='stylesheet' type='text/css'>
  </head>
  <body>

Make a text file out of a scraped URL!
<form action="../cgi-bin/urllib/makeScrapedText.cgi" name="inputForm">  
   URL: <input name="urlInput" class="">
   <br />
   <input type="submit" value="Submit">
</form>

</body>
</html>
#!/usr/bin/python
#-*- coding:utf-8 -*-

# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup

import cgi
import cgitb; cgitb.enable()

from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re


# get the URL from the input form
form = cgi.FieldStorage()                       # Grabs whatever input comes from form
#TODO check if valid url/protocol given. urllib2 bit breaks if not.
url = form.getvalue("urlInput", "http://eleanorg.org")
#url = "http://eleanorg.org"
# grab the hufpo RSS feed
webpage = urlopen(url).read()

# parse it with Beautiful Soup to extract p tags
soup = BeautifulSoup(webpage)
pSoup = soup.findAll('p')      # this creates a list of all the titles


# print message to user in the browser

htmlHeader = """<!DOCTYPE html>
<html>
  <head>
    <title>A form talking to a python script</title>
  </head>
  <body>"""

htmlFooter = """
    </body>
</html>"""

print "Content-Type: text/html"
print
print htmlHeader

f = open("scraped.txt", "w")

for item in pSoup:
        howMany = len(item.contents)
        for i in range(0,howMany):
                text = item.contents[i].string
                if text:
                        print text
                        f.write(text)
                else:
                        print "Sorry, no paragraph content found at that URL."
        print "\n"
f.close()
print htmlFooter