User:Emanuele Bonetti/ProblemSet2.2

From XPUB & Lens-Based wiki

Web comic

import urllib2, html5lib, urlparse
import codecs
import re
from pprint import pprint
import random


def openURL(url):
    """
    Open `url` while presenting a desktop-browser User-Agent.

    returns: a (page, url) tuple, where `page` is the file-like response
             object and `url` is the address reported by the response
             itself -- it may differ from the argument when the server
             redirected the request.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5",
    }
    response = urllib2.urlopen(urllib2.Request(url, headers=browser_headers))
    return (response, response.geturl())

def getImagesFromWeb(url):
    """
    Collect the image addresses referenced by an HTML page.

    returns: a list with one entry per <img> tag found at the given URL, in
             document order; each entry is the tag's src made absolute
             against the page's final (post-redirect) URL
    requires: URL should be of an HTML page
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    # openURL hands back the post-redirect address; relative src values must
    # be resolved against that one, not against the URL we were given.
    (f, url) = openURL(url)
    try:
        tree = parser.parse(f)
    finally:
        # Release the HTTP response even when parsing raises.
        f.close()
    tree.normalize()
    ret = []
    for img in tree.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if not src.startswith("http://"):
            src = urlparse.urljoin(url, src)
        ret.append(src)
    return ret

# Build an HTML "comic": every word of the source text becomes one table cell
# whose background is an image taken from a Flickr search for that word.
# Cells are laid out CELLS_PER_ROW per row and ROWS_PER_TABLE rows per table;
# consecutive tables are separated by a line break.
CELLS_PER_ROW = 4
ROWS_PER_TABLE = 7
# Which <img> of the result page to use.
# NOTE(review): presumably the earlier images are page chrome -- confirm.
IMAGE_INDEX = 5

# Read the whole text as unicode; the context manager closes the file.
with codecs.open("redcircle2.txt", "r", "utf-8") as source:
    t = source.read()

# A word is one or more of a-z, ' or -; re.I makes the match case-insensitive.
words = re.findall(r"\b[a-z'-]+\b", t, re.I)

n = 0  # cells already emitted in the current row
l = 0  # rows already completed in the current table

for word in words:
    # Open the table and/or row lazily so every cell sits inside a <tr>
    # and no empty table or row is ever emitted.
    if n == 0:
        if l == 0:
            print("<table border=\"1\">")
        print("<tr>")

    url = 'http://www.flickr.com/search/?q=' + word + '&l=cc&ss=0&ct=0&mt=all&w=all&adv=1'
    images = getImagesFromWeb(url)

    if len(images) > IMAGE_INDEX:
        # The URL comes from a remote page: escape it for use inside a
        # double-quoted HTML attribute.
        background = images[IMAGE_INDEX].replace("&", "&amp;").replace("\"", "&quot;")
        print("<td width=100px style=\"height:75px; ;\" background=\"" + background + "\">")
    else:
        # Too few images on the result page: keep the grid aligned with the
        # text by emitting an empty cell instead of failing.
        print("<td width=100px style=\"height:75px; ;\">")
    print("</td>")

    n = n + 1
    if n == CELLS_PER_ROW:
        print("</tr>")
        n = 0
        l = l + 1
        if l == ROWS_PER_TABLE:
            print("</table>")
            print("<br />")
            l = 0

# Close whatever is still open after the last word.
if n:
    print("</tr>")
if n or l:
    print("</table>")


Attachments