2008 3.08

From XPUB & Lens-Based wiki

3.08 Print Plug-ins

Fusing Two Webpages with BeautifulSoup

#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse

def absolutize(url, soup):
	"""Rewrite relative links in *soup* so they are absolute against *url*.

	The ``href`` attribute of every tag that has one, and the ``src``
	attribute of every ``img`` tag, is resolved with ``urlparse.urljoin``.
	Values that already start with ``http://`` or ``https://`` are left
	untouched.

	url  -- address the document was fetched from; used as the join base.
	soup -- parsed document exposing ``findAll(name, attrs)``; the tags it
	        returns are modified in place.

	Returns None.
	"""
	# (tag selector, attribute) pairs to process; True matches any tag.
	targets = ((True, 'href'), ('img', 'src'))
	for selector, attr in targets:
		for tag in soup.findAll(selector, {attr: True}):
			value = tag[attr]
			# Require the full "http://" / "https://" prefix: a bare "http"
			# test would also skip relative paths such as "http-errors.html".
			if value.lower().startswith(("http://", "https://")):
				continue
			tag[attr] = urlparse.urljoin(url, value)

### READ PAGES

# Fetch the first page, parse it, and make its links absolute so they still
# resolve once the markup is served from somewhere else.
url = "http://nytimes.com"
data=urllib2.urlopen(url)
try:
	# The parser reads the response while building the soup, so the
	# connection can be released as soon as the constructor returns.
	soup=BeautifulSoup.BeautifulSoup(data)
finally:
	data.close()
absolutize(url, soup)
body=soup.find("body")
if body is None:
	# No <body> tag found: fall back to the whole parsed document rather
	# than failing on None.renderContents() below.
	body = soup

# Same steps for the second page.
url2 = "http://volkskrant.nl"
data2=urllib2.urlopen(url2)
try:
	soup2=BeautifulSoup.BeautifulSoup(data2)
finally:
	data2.close()
absolutize(url2, soup2)
body2=soup2.find("body")
if body2 is None:
	body2 = soup2

### OUTPUT A FUSED PAGE

# Each page's body goes into its own absolutely positioned <div>, so the two
# layouts are drawn on top of each other in the same page.
# Parenthesised single-argument print gives the same output under Python 2.
print("""<div style="position: absolute">""")
print(body.renderContents())
print("""</div>""")

print("""<div style="position: absolute">""")
print(body2.renderContents())
print("""</div>""")