2008 3.08: Difference between revisions

From XPUB & Lens-Based wiki
(New page: 3.08 Print Plug-ins)
 
mNo edit summary
 
(2 intermediate revisions by the same user not shown)
Line 1: Line 1:
3.08 Print Plug-ins
3.08 Print Plug-ins
Fusing Two Webpages with [[BeautifulSoup]]
<source lang="python">
#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse
def absolutize(url, soup):
    """Rewrite relative links in *soup* so they are absolute, in place.

    Every tag carrying an ``href`` attribute and every ``<img>`` tag
    carrying a ``src`` attribute is visited. Values that already start with
    ``http://`` or ``https://`` (case-insensitively) are left untouched;
    everything else is resolved against *url* with ``urlparse.urljoin``.

    url  -- base URL the document was loaded from.
    soup -- parsed document exposing ``findAll(name, attrs)``; the returned
            tags must support ``tag[attr]`` reads and writes.

    Returns None; the tags inside *soup* are modified directly.
    """
    # Match the full scheme separator, not just the letters "http":
    # a relative path such as "http-errors.html" must still be resolved.
    absolute_prefixes = ("http://", "https://")

    # (tag selector, attribute) pairs: any tag with href, only <img> with src.
    targets = ((True, 'href'), ("img", 'src'))

    for selector, attr in targets:
        for tag in soup.findAll(selector, {attr: True}):
            value = tag[attr]
            if not value.lower().startswith(absolute_prefixes):
                tag[attr] = urlparse.urljoin(url, value)
### READ PAGES

def fetch_body(page_url):
    """Download *page_url* and return its <body> with links made absolute.

    The response is parsed with BeautifulSoup, relative links are rewritten
    by absolutize() against the URL the server finally answered from (so
    redirects such as nytimes.com -> www.nytimes.com do not break relative
    paths), and the HTTP response is closed once parsing is done.

    If the document has no <body> element the whole parsed document is
    returned instead, so callers can always call renderContents() on the
    result.
    """
    response = urllib2.urlopen(page_url)
    try:
        final_url = response.geturl()
        page = BeautifulSoup.BeautifulSoup(response)
    finally:
        # Release the connection; the markup has been read by the parser.
        response.close()
    absolutize(final_url, page)
    page_body = page.find("body")
    if page_body is None:
        return page
    return page_body

url = "http://nytimes.com"
body = fetch_body(url)

url2 = "http://volkskrant.nl"
body2 = fetch_body(url2)

### OUTPUT A FUSED PAGE

# Each page goes into its own absolutely positioned <div>, so the two
# documents are drawn on top of each other. Single-argument print(...)
# behaves the same as the Python 2 print statement.
for page_body in (body, body2):
    print("""<div style="position: absolute">""")
    print(page_body.renderContents())
    print("""</div>""")
</source>

Latest revision as of 10:20, 13 August 2008

3.08 Print Plug-ins

Fusing Two Webpages with BeautifulSoup

#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse

def absolutize(url, soup):
    """Rewrite relative links in *soup* so they are absolute, in place.

    Every tag carrying an ``href`` attribute and every ``<img>`` tag
    carrying a ``src`` attribute is visited. Values that already start with
    ``http://`` or ``https://`` (case-insensitively) are left untouched;
    everything else is resolved against *url* with ``urlparse.urljoin``.

    url  -- base URL the document was loaded from.
    soup -- parsed document exposing ``findAll(name, attrs)``; the returned
            tags must support ``tag[attr]`` reads and writes.

    Returns None; the tags inside *soup* are modified directly.
    """
    # Match the full scheme separator, not just the letters "http":
    # a relative path such as "http-errors.html" must still be resolved.
    absolute_prefixes = ("http://", "https://")

    # (tag selector, attribute) pairs: any tag with href, only <img> with src.
    targets = ((True, 'href'), ("img", 'src'))

    for selector, attr in targets:
        for tag in soup.findAll(selector, {attr: True}):
            value = tag[attr]
            if not value.lower().startswith(absolute_prefixes):
                tag[attr] = urlparse.urljoin(url, value)

### READ PAGES

def fetch_body(page_url):
    """Download *page_url* and return its <body> with links made absolute.

    The response is parsed with BeautifulSoup, relative links are rewritten
    by absolutize() against the URL the server finally answered from (so
    redirects such as nytimes.com -> www.nytimes.com do not break relative
    paths), and the HTTP response is closed once parsing is done.

    If the document has no <body> element the whole parsed document is
    returned instead, so callers can always call renderContents() on the
    result.
    """
    response = urllib2.urlopen(page_url)
    try:
        final_url = response.geturl()
        page = BeautifulSoup.BeautifulSoup(response)
    finally:
        # Release the connection; the markup has been read by the parser.
        response.close()
    absolutize(final_url, page)
    page_body = page.find("body")
    if page_body is None:
        return page
    return page_body

url = "http://nytimes.com"
body = fetch_body(url)

url2 = "http://volkskrant.nl"
body2 = fetch_body(url2)

### OUTPUT A FUSED PAGE

# Each page goes into its own absolutely positioned <div>, so the two
# documents are drawn on top of each other. Single-argument print(...)
# behaves the same as the Python 2 print statement.
for page_body in (body, body2):
    print("""<div style="position: absolute">""")
    print(page_body.renderContents())
    print("""</div>""")