User:Emanuele Bonetti/ProblemSet2.2
< User:Emanuele Bonetti
Revision as of 20:34, 23 September 2010 by Migratebot (talk | contribs) (Created page with "Web comic
<source lang="text"> import urllib2, html5lib, urlparse
import codecs
import re
from pprint import pprint
import random
def openURL (url):
"""
ret...")
Web comic
import urllib2, html5lib, urlparse
import codecs
import re
from pprint import pprint
import random
def openURL(url):
    """
    Open `url` while identifying as a desktop Firefox browser.

    returns: a (page, url) tuple -- `page` is the open file-like response
    object and `url` is the address the response was actually served from,
    which may differ from the requested one when a redirect was followed
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5",
    }
    response = urllib2.urlopen(urllib2.Request(url, headers=headers))
    # geturl() reports the final URL after urllib2 has handled any redirects.
    return (response, response.geturl())
def getImagesFromWeb (url):
    """
    returns: a list of the src's found in <img> tags at the given URL, in
             document order; any src that does not start with "http://" is
             resolved against the page's final (post-redirect) URL
    requires: URL should be of an HTML page
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    # openURL hands back the URL the page was really served from; rebinding
    # `url` makes relative src's resolve against that address.
    (f, url) = openURL(url)
    try:
        tree = parser.parse(f)
    finally:
        # Release the connection even when parsing fails.
        f.close()
    tree.normalize()
    ret = []
    for img in tree.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if not src.startswith("http://"):
            src = urlparse.urljoin(url, src)
        ret.append(src)
    return ret
# Read the whole input text decoded as UTF-8; the with-block closes the file
# as soon as its contents have been read.
with codecs.open("redcircle2.txt", "r", "utf-8") as source_file:
    t = source_file.read()
# Split the text into words: runs of letters, apostrophes or hyphens bounded
# by word boundaries. re.I makes the a-z range match upper case as well.
words = re.findall(r"\b[a-z'-]+\b", t, re.I)
# Write HTML tables to stdout with one cell per word; each cell uses an image
# found on the Flickr search results page for that word as its background.
# NOTE(review): this copy of the script has lost all leading indentation, so
# the nesting of the loop body and of the if/else section below cannot be
# read from the text -- restore it before running.
print "<table border=\"1\">"
print "<tr>"
# Counters: presumably n counts cells in the current row and l counts rows
# completed in the current table -- confirm once indentation is restored.
n=0
l=0
for word in words:
# Flickr search URL for this word; the word is concatenated as-is, without
# any URL-encoding.
url='http://www.flickr.com/search/?q='+word+'&l=cc&ss=0&ct=0&mt=all&w=all&adv=1'
#url='http://images.google.nl/images?hl=en&source=hp&q=stuff&aql=&oq=&um=1&ie=UTF-8&sa=N&tab=wi#start=0&imgtbs=lt&tbo=1&imgcolor=red&imgtype=lineart&imgc=specific'
images=getImagesFromWeb(url)
# The sixth <img> on the results page becomes the cell background; this
# raises IndexError when the page yields fewer than six images.
print "<td width=100px style=\"height:75px; ;\" background=\""+images[5]+"\">"
print "</td>"
# When n has reached 3 the row is closed, n is reset and l is incremented.
# NOTE(review): no new "<tr>" is printed after this "</tr>".
if n == 3:
print "</tr>"
n=0
l=l+1
else:
n=n+1
# When l has reached 7 the current table is closed and a fresh table and
# row are opened, with l reset to 0.
if l==7:
print "</tr>"
print "</table>"
print "<br />"
print "<table border=\"1\">"
print "<tr>"
l=0
# NOTE(review): with indentation lost it is unclear which block this final
# increment belongs to -- verify against the original script.
n=n+1