2008 3.08
Revision as of 10:20, 13 August 2008 by Michael Murtaugh (talk | contribs)
3.08 Print Plug-ins
Fusing Two Webpages with Beautiful Soup
#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse
def absolutize(url, soup):
    """Rewrite relative links in *soup* so that they resolve against *url*.

    Every tag carrying an ``href`` attribute, and every ``img`` tag carrying
    a ``src`` attribute, is updated in place.  Values that already begin
    with ``http://`` or ``https://`` (compared case-insensitively) are left
    untouched; every other value is replaced by
    ``urlparse.urljoin(url, value)``.

    Args:
        url: Address the document was fetched from; used as the join base.
        soup: Parsed document exposing ``findAll(name, attrs)`` whose results
            support item access for their attributes.

    Returns:
        None.  *soup* is modified in place.
    """
    # (tag selector, attribute) pairs, handled in this order: any tag with
    # an href first, then <img> tags with a src.
    targets = ((True, 'href'), ("img", 'src'))
    for selector, attr in targets:
        for tag in soup.findAll(selector, {attr: True}):
            value = tag[attr]
            # Look for a real scheme prefix instead of a bare "http": a
            # relative path such as "http-archive/index.html" still has to
            # be resolved against the base address.
            if not value.lower().startswith(("http://", "https://")):
                tag[attr] = urlparse.urljoin(url, value)
### READ PAGES
def _load_page(page_url):
    """Fetch *page_url* and return its parsed tree with links absolutized."""
    response = urllib2.urlopen(page_url)
    try:
        page = BeautifulSoup.BeautifulSoup(response)
    finally:
        # Release the connection once the parser is done with the response,
        # including when parsing raises.
        response.close()
    absolutize(page_url, page)
    return page


url = "http://nytimes.com"
soup = _load_page(url)
body = soup.find("body")

url2 = "http://volkskrant.nl"
soup2 = _load_page(url2)
body2 = soup2.find("body")

### OUTPUT A FUSED PAGE
# Both pages are fetched before anything is printed.  Each body is then
# wrapped in its own absolutely positioned <div>, first page first.
# A single-argument print(...) emits the same text as the print statement.
for page_body in (body, body2):
    print("""<div style="position: absolute">""")
    print(page_body.renderContents())
    print("""</div>""")