2008 3.08: Difference between revisions
(New page: 3.08 Print Plug-ins) |
No edit summary |
||
Line 1: | Line 1: | ||
3.08 Print Plug-ins | 3.08 Print Plug-ins | ||
Fusing Two Webpages with Beautiful Soup | |||
<source lang="python"> | |||
#!/usr/bin/python | |||
import BeautifulSoup, urllib2, urlparse | |||
def absolutize (url, soup): | |||
resultset=soup.findAll(True, {'href': True}) | |||
for r in resultset: | |||
href = r['href'] | |||
if not href.lower().startswith("http"): | |||
r['href'] = urlparse.urljoin(url, href) | |||
resultset=soup.findAll("img", {'src': True}) | |||
for r in resultset: | |||
src = r['src'] | |||
if not src.lower().startswith("http"): | |||
r['src'] = urlparse.urljoin(url, src) | |||
### READ PAGES | |||
url = "http://nytimes.com" | |||
data=urllib2.urlopen(url) | |||
soup=BeautifulSoup.BeautifulSoup(data) | |||
absolutize(url, soup) | |||
body=soup.find("body") | |||
url2 = "http://volkskrant.nl" | |||
data2=urllib2.urlopen(url2) | |||
soup2=BeautifulSoup.BeautifulSoup(data2) | |||
absolutize(url2, soup2) | |||
body2=soup2.find("body") | |||
### OUTPUT A FUSED PAGE | |||
print """<div style="position: absolute">""" | |||
print body.renderContents() | |||
print """</div>""" | |||
print """<div style="position: absolute">""" | |||
print body2.renderContents() | |||
print """</div>""" | |||
</source> |
Revision as of 11:11, 5 June 2008
3.08 Print Plug-ins
Fusing Two Webpages with Beautiful Soup
#!/usr/bin/python
import BeautifulSoup, urllib2, urlparse
def _make_absolute(url, tags, attr):
    """Resolve attribute *attr* of each tag in *tags* against *url*, in place."""
    for tag in tags:
        value = tag[attr]
        # Decide by the parsed scheme rather than a textual "http" prefix:
        # a relative path such as "httpd/manual.html" has no scheme and must
        # still be resolved, while "http://...", "HTTPS://...", "mailto:..."
        # and similar already carry one and are left exactly as written.
        if not urlparse.urlsplit(value).scheme:
            tag[attr] = urlparse.urljoin(url, value)


def absolutize(url, soup):
    """Rewrite relative links in *soup* so they point back at their origin.

    Every tag carrying an ``href`` attribute, and every ``img`` tag carrying
    a ``src`` attribute, has that attribute resolved against *url*.  Values
    that already include a URL scheme are not modified.

    url  -- address the document was fetched from; used as the join base.
    soup -- parsed document offering ``findAll(name, attrs)``; the tags it
            returns are modified in place.

    Returns None.
    """
    _make_absolute(url, soup.findAll(True, {'href': True}), 'href')
    _make_absolute(url, soup.findAll("img", {'src': True}), 'src')
### READ PAGES
url = "http://nytimes.com"
data=urllib2.urlopen(url)
soup=BeautifulSoup.BeautifulSoup(data)
absolutize(url, soup)
body=soup.find("body")
url2 = "http://volkskrant.nl"
data2=urllib2.urlopen(url2)
soup2=BeautifulSoup.BeautifulSoup(data2)
absolutize(url2, soup2)
body2=soup2.find("body")
### OUTPUT A FUSED PAGE
print """<div style="position: absolute">"""
print body.renderContents()
print """</div>"""
print """<div style="position: absolute">"""
print body2.renderContents()
print """</div>"""