2008 3.08

The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and use your browser's default print function instead.

3.08 Print Plug-ins

Fusing Two Webpages with BeautifulSoup

#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse

def _absolutize_attr(url, tags, attr):
    """Resolve the `attr` value of every tag in `tags` against `url`, in place."""
    for tag in tags:
        # Browsers ignore whitespace around URL attributes; strip it so it
        # does not end up inside the joined URL.
        value = tag[attr].strip()
        # Require the full "scheme://" prefix: a bare "http" test would also
        # match relative links such as "http-faq.html" and leave them broken.
        if value.lower().startswith(("http://", "https://")):
            continue
        tag[attr] = urlparse.urljoin(url, value)


def absolutize (url, soup):
    """Rewrite relative links in `soup` so they are absolute w.r.t. `url`.

    Two sets of attributes are rewritten in place:
      * `href` on any tag that carries one;
      * `src` on `img` tags.

    Values that already start with "http://" or "https://" (compared
    case-insensitively) are left untouched; everything else is passed
    through `urlparse.urljoin(url, value)`.

    url  -- base URL the document was fetched from.
    soup -- parsed document exposing `findAll(name, attrs)`; the returned
            tags must support `tag[attr]` get/set.

    Returns None; `soup` is modified in place.
    """
    _absolutize_attr(url, soup.findAll(True, {'href': True}), 'href')
    _absolutize_attr(url, soup.findAll("img", {'src': True}), 'src')

### READ PAGES

url = "http://nytimes.com"
data=urllib2.urlopen(url)
soup=BeautifulSoup.BeautifulSoup(data)
absolutize(url, soup)
body=soup.find("body")

url2 = "http://volkskrant.nl"
data2=urllib2.urlopen(url2)
soup2=BeautifulSoup.BeautifulSoup(data2)
absolutize(url2, soup2)
body2=soup2.find("body")

### OUTPUT A FUSED PAGE

print """<div style="position: absolute">""" 
print body.renderContents()
print """</div>"""

print """<div style="position: absolute">""" 
print body2.renderContents()
print """</div>"""