User:Eleanorg/2.1/Beautiful Soup scraping: Difference between revisions
(Created page with "More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other useful things. ==Scraping & saving to a text file== <sour...") |
No edit summary |
||
(2 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other | More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other fun things. | ||
Line 43: | Line 43: | ||
for i in range(1,5): | for i in range(1,5): | ||
print titleSoup[i].contents[0] # uses the BS .contents method to grab the content out of the | print titleSoup[i].contents[0] # uses the BS .contents method to grab the content out of the title tags | ||
print linkSoup[i].get('href') # uses the BS .get() method to get the href. | print linkSoup[i].get('href') # uses the BS .get() method to get the href. | ||
print "\n" | print "\n" | ||
print """ | print """ | ||
Line 58: | Line 58: | ||
# f.write(str(linkSoup[i].get('href')) | # f.write(str(linkSoup[i].get('href')) | ||
f.close() | f.close() | ||
</source> | |||
==Accepting input from the user== | |||
This html form lets the user input a URL. The associated script attempts to scrape it for anything useful, and saves this text to a text file as well as printing it onscreen. | |||
<source lang="html4strict"> | |||
<!DOCTYPE html> | |||
<html> | |||
<head> | |||
<title>make a scraped pdf!</title> | |||
<link <link rel="stylesheet" href=""> | |||
<link href='http://fonts.googleapis.com/css?family=Josefin+Sans' rel='stylesheet' type='text/css'> | |||
</head> | |||
<body> | |||
Make a text file out of a scraped URL! | |||
<form action="../cgi-bin/urllib/makeScrapedText.cgi" name="inputForm"> | |||
URL: <input name="urlInput" class=""> | |||
<br /> | |||
<input type="submit" value="Submit"> | |||
</form> | |||
</body> | |||
</html> | |||
</source> | |||
<source lang="python"> | |||
#!/usr/bin/python | |||
#-*- coding:utf-8 -*- | |||
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE | |||
# using urllib & BeautifulSoup | |||
import cgi | |||
import cgitb; cgitb.enable() | |||
from urllib import urlopen | |||
from BeautifulSoup import BeautifulSoup | |||
import re | |||
# get the URL from the input form | |||
form = cgi.FieldStorage() # Grabs whatever input comes from form | |||
#TODO check if valid url/protocol given. urllib2 bit breaks if not. | |||
url = form.getvalue("urlInput", "http://eleanorg.org") | |||
#url = "http://eleanorg.org" | |||
# grab the hufpo RSS feed | |||
webpage = urlopen(url).read() | |||
# parse it with Beautiful Soup to extract p tags | |||
soup = BeautifulSoup(webpage) | |||
pSoup = soup.findAll('p') # this creates a list of all the titles | |||
# print message to user in the browser | |||
htmlHeader = """<!DOCTYPE html> | |||
<html> | |||
<head> | |||
<title>A form talking to a python script</title> | |||
</head> | |||
<body>""" | |||
htmlFooter = """ | |||
</body> | |||
</html>""" | |||
print "Content-Type: text/html" | |||
print | |||
print htmlHeader | |||
f = open("scraped.txt", "w") | |||
for item in pSoup: | |||
howMany = len(item.contents) | |||
for i in range(0,howMany): | |||
text = item.contents[i].string | |||
if text: | |||
print text | |||
f.write(text) | |||
else: | |||
print "Sorry, no paragraph content found at that URL." | |||
print "\n" | |||
f.close() | |||
print htmlFooter | |||
</source> | </source> |
Latest revision as of 15:39, 26 October 2012
More fun with Beautiful Soup. Extracting the useful bits from webpages, so that I can save them to text files or other fun things.
Scraping & saving to a text file
#!/usr/bin/python
#-*- coding:utf-8 -*-
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
# Scrape the Huffington Post raw RSS feed, echo a few titles and links back
# as an HTML page, then save the same titles to scraped.txt.
#
# NOTE: this is a Python 2 CGI script (urllib.urlopen, BeautifulSoup 3).
# print(...) with a single argument behaves exactly like the print statement
# here, so the output is unchanged.

FEED_URL = 'http://feeds.huffingtonpost.com/huffingtonpost/raw_feed'
OUTPUT_PATH = "scraped.txt"

# grab the HuffPost RSS feed
webpage = urlopen(FEED_URL).read()

# parse it with Beautiful Soup to extract titles & links
soup = BeautifulSoup(webpage)
titleSoup = soup.findAll('title')  # list of all the <title> tags
linkSoup = soup.findAll('link')    # list of all the <link> tags


def _first_text(tag):
    """Return the first child of *tag* as UTF-8 bytes ('' if it has none).

    Encoding explicitly avoids the UnicodeEncodeError that Python 2 raises
    when non-ASCII unicode is passed to str() or printed to a CGI stdout.
    """
    if not tag.contents:
        return ""
    return unicode(tag.contents[0]).encode("utf-8")


# Entries 1-4, as in the original range(1, 5); index 0 is skipped
# (presumably the feed's own title/link -- TODO confirm against the feed).
# Slicing the zipped list cannot raise IndexError if the feed is short.
entries = list(zip(titleSoup, linkSoup))[1:5]

# reproduce the titles & links found on our own page:
print("Content-Type: text/html; charset=utf-8")
print("")
print("""
<!DOCTYPE html>
<html>
<head>
<title></title>
<style type="text/css">
img { position: absolute; top:100px; left:100px; opacity: 0.3; width:500px; }
</style>
</head>
<body>""")

for title, link in entries:
    print(_first_text(title))  # first child of the <title> tag
    print(link.get('href'))    # BS .get() returns None when there is no href
    print("\n")

print("""
</body>
</html>""")

# now let's write those values to a text file (titles only; the links are
# deliberately not written). The with-block closes the file even if a
# write fails.
with open(OUTPUT_PATH, "w") as f:
    for title, _link in entries:
        f.write(_first_text(title))
        f.write("\n\n")
Accepting input from the user
This HTML form lets the user enter a URL. The associated script fetches that page, extracts the text of its paragraph (&lt;p&gt;) tags, and saves this text to a text file as well as printing it onscreen.
<!DOCTYPE html>
<!-- Input form: submits the URL typed by the user (field "urlInput") to
     makeScrapedText.cgi. No method is given, so the browser sends a GET. -->
<html>
  <head>
    <title>Make a scraped text file!</title>
    <!-- placeholder for a local stylesheet (href intentionally left empty) -->
    <link rel="stylesheet" href="">
    <link href='http://fonts.googleapis.com/css?family=Josefin+Sans' rel='stylesheet' type='text/css'>
  </head>
  <body>
    Make a text file out of a scraped URL!
    <form action="../cgi-bin/urllib/makeScrapedText.cgi" name="inputForm">
      <label>URL: <input type="text" name="urlInput"></label>
      <br />
      <input type="submit" value="Submit">
    </form>
  </body>
</html>
#!/usr/bin/python
#-*- coding:utf-8 -*-
# tutorial on web scraping: http://www.youtube.com/watch?v=Ap_DlSrT-iE
# using urllib & BeautifulSoup
import cgi
import cgitb; cgitb.enable()
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
# Scrape the text of every <p> tag from a user-supplied URL, show it in the
# browser and save it to scraped.txt.
#
# NOTE: this is a Python 2 CGI script (urllib.urlopen, BeautifulSoup 3).
# print(...) with a single argument behaves exactly like the print statement.
from urlparse import urlparse  # Python 2 stdlib; only this script needs it

DEFAULT_URL = "http://eleanorg.org"
# urllib.urlopen() opens *local files* for scheme-less or file: URLs, so the
# user-supplied value must be restricted to real web URLs before fetching.
ALLOWED_SCHEMES = ("http", "https")
OUTPUT_PATH = "scraped.txt"

htmlHeader = """<!DOCTYPE html>
<html>
<head>
<title>A form talking to a python script</title>
</head>
<body>"""
htmlFooter = """
</body>
</html>"""

# get the URL from the input form; getfirst() always returns a single string
# even if the field was submitted more than once
form = cgi.FieldStorage()
url = form.getfirst("urlInput", DEFAULT_URL).strip()

errorMessage = None
fetched = False
paragraphs = []  # one UTF-8 byte string per <p> tag that contains text

if urlparse(url).scheme.lower() not in ALLOWED_SCHEMES:
    errorMessage = "Sorry, only http:// and https:// URLs can be scraped."
else:
    try:
        # fetch the page the user asked for
        webpage = urlopen(url).read()
    except IOError:
        # urllib.urlopen raises IOError when the connection cannot be made
        errorMessage = "Sorry, that URL could not be fetched."
    else:
        fetched = True
        # parse it with Beautiful Soup to extract p tags
        soup = BeautifulSoup(webpage)
        pSoup = soup.findAll('p')  # list of all the <p> tags
        for item in pSoup:
            chunks = []
            for child in item.contents:
                text = child.string  # None for children without a single string
                if text:
                    # encode explicitly: Python 2 cannot print/write
                    # non-ASCII unicode to a CGI stdout or a plain file
                    chunks.append(unicode(text).encode("utf-8"))
            if chunks:
                paragraphs.append("".join(chunks))

# print message to user in the browser
print("Content-Type: text/html; charset=utf-8")
print("")
print(htmlHeader)

if errorMessage:
    print(errorMessage)
elif not paragraphs:
    # reported once, and only when nothing at all was found
    print("Sorry, no paragraph content found at that URL.")
else:
    for paragraph in paragraphs:
        print(paragraph)
        print("\n")

if fetched:
    # The file is (re)written whenever the fetch succeeded, even if empty.
    # The with-block closes it even if a write fails.
    with open(OUTPUT_PATH, "w") as f:
        for paragraph in paragraphs:
            f.write(paragraph)
            f.write("\n")  # keep paragraphs on separate lines in the file

print(htmlFooter)