User:Eleanorg/2.1/Beautiful Soup scraping
More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other fun things.
Scraping & saving to a text file
#!/usr/bin/python
#-*- coding:utf-8 -*-
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
# grab the hufpo RSS feed
webpage = urlopen('http://feeds.huffingtonpost.com/huffingtonpost/raw_feed').read()
# parse it with Beautiful Soup to extract titles & links
soup = BeautifulSoup(webpage)
titleSoup = soup.findAll('title') # this creates a list of all the titles
linkSoup = soup.findAll('link') # this creates a list of all the links
# reproduce the titles & links found on our own page:
print "Content-Type: text/html"
print
print """
<!DOCTYPE html>
<html>
<head>
<title></title>
<style type="text/css">
img { position: absolute; top:100px; left:100px; opacity: 0.3; width:500px; }
</style>
</head>
<body>"""
for i in range(1,5):
print titleSoup[i].contents[0] # uses the BS .contents method to grab the content out of the title tags
print linkSoup[i].get('href') # uses the BS .get() method to get the href.
print "\n"
print """
</body>
</html>"""
# now let's write those values to a text file:
f = open("scraped.txt", "w")
for i in range(1,5):
f.write(str(titleSoup[i].contents[0]))
f.write("\n\n")
# f.write(str(linkSoup[i].get('href'))
f.close()
Accepting input from the user
This html form lets the user input a URL. The associated script attempts to scrape it for anything useful, and saves this text to a text file as well as printing it onscreen.
<!DOCTYPE html>
<html>
<head>
  <title>make a scraped pdf!</title>
  <!-- fixed: tag was malformed ("<link <link rel=...") — a doubled tag open -->
  <link rel="stylesheet" href="">
  <link href='http://fonts.googleapis.com/css?family=Josefin+Sans' rel='stylesheet' type='text/css'>
</head>
<body>
  Make a text file out of a scraped URL!
  <!-- submits urlInput to the scraping CGI script -->
  <form action="../cgi-bin/urllib/makeScrapedText.cgi" name="inputForm">
    URL: <input name="urlInput" class="">
    <br />
    <input type="submit" value="Submit">
  </form>
</body>
</html>
#!/usr/bin/python
#-*- coding:utf-8 -*-
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup
import cgi
import cgitb; cgitb.enable()
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
# get the URL from the input form
form = cgi.FieldStorage() # Grabs whatever input comes from form
#TODO check if valid url/protocol given. urllib2 bit breaks if not.
url = form.getvalue("urlInput", "http://eleanorg.org")
#url = "http://eleanorg.org"
# grab the hufpo RSS feed
webpage = urlopen(url).read()
# parse it with Beautiful Soup to extract p tags
soup = BeautifulSoup(webpage)
pSoup = soup.findAll('p') # this creates a list of all the titles
# print message to user in the browser
htmlHeader = """<!DOCTYPE html>
<html>
<head>
<title>A form talking to a python script</title>
</head>
<body>"""
htmlFooter = """
</body>
</html>"""
print "Content-Type: text/html"
print
print htmlHeader
f = open("scraped.txt", "w")
for item in pSoup:
howMany = len(item.contents)
for i in range(0,howMany):
text = item.contents[i].string
if text:
print text
f.write(text)
else:
print "Sorry, no paragraph content found at that URL."
print "\n"
f.close()
print htmlFooter