User:Laura Macchini/project1: Difference between revisions
No edit summary |
No edit summary |
||
(7 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
<source lang= | Tumblsaurus is a web application that allows to browse tumblr in a semantic-associative way. | ||
Given a keyword, it will look up four related words (synonims or words used in the same context) and display the latest tumblr image results for that topic. | |||
It is possible to browse the results by clicking on the images or typing another keyword. | |||
===Video Documentation=== | |||
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video> | |||
===Link to the Rebelhuis website=== | |||
[[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]] | |||
==Code Dump== | |||
<source lang=python> | |||
#!/usr/bin/python | #!/usr/bin/python | ||
import cgi, cgitb | import cgi, cgitb | ||
Line 10: | Line 22: | ||
print "Content-type: text/html" | print "Content-type: text/html" | ||
print | print | ||
form = cgi.FieldStorage() | |||
keyword = form.getvalue('query') | |||
print ''' | print ''' | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | ||
<head> | |||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> | |||
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script> | |||
<script type="text/javascript" src="../slideshows.js"></script> | |||
<link rel="stylesheet" href="../style.css" /> | |||
<title> Tumblrsaurus </title> | |||
< | </head> | ||
</ | <body> | ||
<div id="wrapper"> | |||
<div id="header"> | |||
<div id="realdeal"> you looked for '''+str(keyword)+''' <br/> | |||
<br/> | |||
<img src="../arrow.png" alt="arrow"/> | |||
</div> | |||
</div> | |||
''' | |||
#global | #global | ||
Line 28: | Line 53: | ||
results = [] | results = [] | ||
searchdef = 'site:gutenberg.org '+keyword | searchdef = 'site:gutenberg.org '+str(keyword) | ||
extensions = ("jpg","png","bmp","gif") | extensions = ("jpg","png","bmp","gif") | ||
Line 35: | Line 60: | ||
def displayKeywords(): | def displayKeywords(): | ||
first = 1 | |||
targetfolder = './images/'+keyword | targetfolder = './images/'+keyword | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
print '''<div id="slideshow1">''' | |||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print " | targetfolderurl = targetfolder+"/"+url | ||
print '''</ | print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>' | ||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />' | |||
print '''</div>''' | |||
def displayImages(similars): | def displayImages(similars): | ||
displayKeywords() | displayKeywords() | ||
counter2 = 2 | |||
for sim in similars: | for sim in similars: | ||
print " | print '''<div id="slideshow'''+str(counter2)+'''">''' | ||
counter2 += 1 | |||
first = 1 | |||
targetfolder = './images/'+sim | targetfolder = './images/'+sim | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print "< | targetfolderurl = targetfolder+"/"+url | ||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>' | |||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />' | |||
print ''' </div> ''' | |||
print '''</div> </body> </html>''' | |||
Line 80: | Line 118: | ||
cmd = 'wget -q %s' %pipe | cmd = 'wget -q %s' %pipe | ||
cmd2 = 'chmod 755 %s' %filename | cmd2 = 'chmod 755 %s' %filename | ||
cmd3 = 'mogrify -resize 250 %s' %filename | |||
os.system(cmd) | os.system(cmd) | ||
os.system(cmd2) | os.system(cmd2) | ||
os.system(cmd3) | |||
os.chdir("../..") | os.chdir("../..") | ||
break | break | ||
Line 117: | Line 157: | ||
def tumblrMachine(similars): | def tumblrMachine(similars): | ||
tumblrDownloader(keyword) | tumblrDownloader(keyword) | ||
print similars | |||
for e in similars: | for e in similars: | ||
tumblrDownloader(e) | tumblrDownloader(e) | ||
Line 131: | Line 172: | ||
words = similar(text, keyword, 20) | words = similar(text, keyword, 20) | ||
if words: | if words: | ||
for w in words.split(): | ##!!!##############!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | ||
#if len(words.split()) < 3 counter+1 showless | |||
if len(words.split()) > 4: | |||
for w in words.split(): | |||
if len(w) > 3: | |||
similars.append(w) | |||
# print similars | similars.sort(cmp=bylength) | ||
similars.reverse() | |||
similars = similars[0:4] | |||
# print "no matches" | # print similars | ||
tumblrMachine(similars) | |||
else: | |||
# print "no matches" | |||
COUNTER += 1 | |||
showless() | |||
def gutenbergClean(url): | def gutenbergClean(url): | ||
Line 155: | Line 199: | ||
def googleQuery(astr,num): | def googleQuery(astr,num): | ||
os.chdir("..") | os.chdir("..") | ||
if keyword == "berlusconi": | |||
for i,h in enumerate(itertools.islice(hits(astr),num)): | print "please, do not enter swear words." | ||
elif keyword == "kill" or keyword == "killer": | |||
print "don't be silly" | |||
else: | |||
for i,h in enumerate(itertools.islice(hits(astr),num)): | |||
results.append(h) | |||
showless() | |||
def showless(): | def showless(): | ||
global COUNTER | global COUNTER | ||
urlG = results[COUNTER] | if results: | ||
urlG = results[COUNTER] | |||
gutenbergClean(urlG) | |||
else: | |||
print "sorry, no matches" | |||
googleQuery(searchdef,50) | googleQuery(searchdef,50) | ||
</source> | </source> |
Latest revision as of 10:31, 24 May 2011
Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way. Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them. You can keep browsing the results by clicking on the images or by typing another keyword.
Video Documentation
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>
Link to the Rebelhuis website
[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]
Code Dump
#!/usr/bin/python
import cgi, cgitb
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap
# CGI response: the header line, then the empty line that ends the headers.
print("Content-type: text/html")
print("")

# 'query' is the search term submitted with the request; getvalue() yields
# None when the field is missing.
form = cgi.FieldStorage()
keyword = form.getvalue('query')

# The search term is untrusted input, so it is HTML-escaped (including
# quotes) before being echoed into the page. `keyword` itself is left
# untouched because the rest of the script uses the raw value.
print('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" />
<title> Tumblrsaurus </title>
</head>
<body>
<div id="wrapper">
<div id="header">
<div id="realdeal"> you looked for ''' + cgi.escape(str(keyword), True) + ''' <br/>
<br/>
<img src="../arrow.png" alt="arrow"/>
</div>
</div>
''')
# Module-level state shared by the functions below.
# Index into `results` of the search hit that showless() hands to gutenbergClean().
# NOTE(review): starts at 1, so results[0] is never read — confirm this is intended.
COUNTER = 1
# Result URLs appended by googleQuery().
results = []
# Search string: the submitted keyword restricted to gutenberg.org.
searchdef = 'site:gutenberg.org '+str(keyword)
# Image file extensions; tumblrDownloader() iterates over this tuple.
extensions = ("jpg","png","bmp","gif")
def bylength(word1, word2):
    """Compare two words by length, cmp-style.

    Returns a negative number when word1 is shorter than word2, zero when
    both have the same length, and a positive number when word1 is longer.
    """
    first_size = len(word1)
    second_size = len(word2)
    return first_size - second_size
def displayKeywords():
    """Print the first slideshow <div> with the images stored for the keyword.

    Lists ./images/<keyword> and prints one <img> tag per directory entry;
    the first tag carries class="active". Reads the module-level `keyword`.

    Raises:
        OSError: if the keyword's image folder cannot be listed.
    """
    targetfolder = './images/' + keyword
    keyword_dir_list = os.listdir(targetfolder)

    # `keyword` is raw form input and the file names come from remote URLs,
    # so both are escaped (quotes included) before landing in attributes.
    alt_text = cgi.escape(keyword, True)

    print('<div id="slideshow1">')
    for position, url in enumerate(keyword_dir_list):
        src = cgi.escape('../' + targetfolder + "/" + url, True)
        if position == 0:
            print('<img src="' + src + '" alt="' + alt_text + '" class="active"/>')
        else:
            print('<img src="' + src + '" alt="' + alt_text + '" />')
    print('</div>')
def displayImages(similars):
    """Print the slideshows for the keyword and for every related word.

    The keyword's slideshow is printed by displayKeywords(); each word in
    `similars` then gets its own <div id="slideshowN"> (N counting up from
    2) filled with the images found in ./images/<word>, the first image
    being marked class="active". Finally the closing tags of the wrapper
    div, body and html are printed.
    """
    displayKeywords()
    for slide_number, sim in enumerate(similars, 2):
        print('<div id="slideshow' + str(slide_number) + '">')
        folder = './images/' + sim
        for position, name in enumerate(os.listdir(folder)):
            image_path = folder + "/" + name
            if position == 0:
                print('<img src="../' + image_path + '" alt="' + sim + '" class="active"/>')
            else:
                print('<img src="../' + image_path + '" alt="' + sim + '" />')
        print(' </div> ')
    print('</div> </body> </html>')
def tumblrDownloader(zoekwoord):
    """Download the images of the Tumblr feed for `zoekwoord` into images/<zoekwoord>.

    The RSS feed is fetched through the Yahoo Pipe, every entry description
    is scanned for a token ending in "img", and the token after it is taken
    to be src="<url>". Images not yet present in the target folder are
    fetched with wget, made world-readable and resized to 250 pixels wide
    with mogrify.

    NOTE(review): the original indentation was lost, so the position of its
    `break` is ambiguous; this version fetches every image found in a
    description exactly once — confirm against the deployed script.
    """
    # Local import: this is the only function that spawns external tools.
    import subprocess

    # Encode the search term so spaces or '&' cannot corrupt the query string.
    pipe = ("http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g"
            "&_render=rss&q=" + urllib.quote_plus(zoekwoord))
    feed = feedparser.parse(pipe)

    # We need a folder to store the images
    targetfolder = 'images/' + zoekwoord
    if not os.path.isdir(targetfolder):
        os.mkdir(targetfolder)

    for e in feed.entries:
        words = e.description.split()
        # Pair each token with its successor; a trailing "img" token without
        # a following attribute is simply skipped instead of raising IndexError.
        for word, following in zip(words, words[1:]):
            if not word.endswith("img"):
                continue
            image_url = following[5:-1]  # strip the leading src=" and the closing quote
            filename = image_url.split('/')[-1]
            if not filename:
                continue  # URL ends in '/': there is no file name to store
            if os.path.isfile(os.path.join(targetfolder, filename)):
                continue  # already downloaded
            # Argument lists (no shell) keep feed-controlled text from being
            # interpreted as shell syntax; cwd= replaces the chdir round-trip.
            subprocess.call(['wget', '-q', image_url], cwd=targetfolder)
            subprocess.call(['chmod', '755', filename], cwd=targetfolder)
            subprocess.call(['mogrify', '-resize', '250', filename], cwd=targetfolder)
def hits(astr):
    """Yield result URLs for the web search `astr`, eight results per request.

    Pages are requested one after another from the Google AJAX search
    endpoint. The generator ends when the service returns no
    'responseData' or an empty result page.
    """
    for start in itertools.count():
        query = urllib.urlencode({'q': astr, 'rsz': 8, 'start': start * 8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (query)
        search_results = urllib.urlopen(url)
        try:
            payload = json.loads(search_results.read())
        finally:
            search_results.close()

        data = payload['responseData']
        if not data:
            # A plain return ends the generator; raising StopIteration here
            # becomes a RuntimeError on interpreters that implement PEP 479.
            return
        page = data['results']
        if not page:
            return  # an empty page means nothing more to fetch
        for h in page:
            yield h['url']
def similar(self, word, num):
    """Return words of the text that occur in the same contexts as `word`.

    `self` is the text object (getText passes an nltk.Text). A ContextIndex
    over its alphabetic, lower-cased tokens is built on first use and cached
    on the object. The result is the first `num` keys of the frequency
    distribution of words sharing a context with `word`, formatted by
    tokenwrap(); None is returned when `word` is not in the index.
    """
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())

    target = word.lower()
    index = self._word_context_index._word_to_contexts
    if target not in index.conditions():
        return None

    shared_contexts = set(index[target])
    counts = FreqDist(
        other
        for other in index.conditions()
        for context in index[other]
        if context in shared_contexts and not other == target)
    return tokenwrap(counts.keys()[:num])
def tumblrMachine(similars):
    """Download images for the keyword and all related words, then show them.

    Fetches the keyword's images first, prints the list of related words,
    fetches the images of each related word and finally prints the
    slideshows through displayImages().
    """
    tumblrDownloader(keyword)
    print(similars)
    for related_word in similars:
        tumblrDownloader(related_word)
    displayImages(similars)
def getText(url):
    """Find words related to the keyword in the text at `url` and show their images.

    The document is downloaded, tokenised and searched with similar(). When
    more than four related words come back, those longer than three
    characters are ranked by length and the four longest go to
    tumblrMachine(). Otherwise (no match, or too few words) COUNTER is
    advanced and showless() tries the next search result.

    NOTE(review): the original indentation was lost, so it is unclear which
    `if` its `else` belonged to; handling both the "no words" and the
    "too few words" case by moving on follows the author's TODO comment
    ("if len(words.split()) < 3 counter+1 showless") — confirm.
    """
    global COUNTER

    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()

    text = nltk.Text(nltk.word_tokenize(raw))
    words = similar(text, keyword, 20)
    candidates = words.split() if words else []

    if len(candidates) > 4:
        similars = [w for w in candidates if len(w) > 3]
        # Stable sort followed by reverse() reproduces the ordering of the
        # former sort(cmp=bylength) + reverse(), including the order of ties.
        similars.sort(key=len)
        similars.reverse()
        tumblrMachine(similars[0:4])
    else:
        COUNTER += 1
        showless()
def gutenbergClean(url):
    """Map a search-result URL to a gutenberg.org file URL and process it.

    The part of the last path segment before its first dot is used as the
    book id; the rebuilt URL is
    http://www.gutenberg.org/files/<id>/<id>.htm when the segment ends in
    ".htm" and .../<id>.txt otherwise. The result is passed to getText().
    """
    last_segment = url.split('/')[-1]
    suffix = '.htm' if last_segment.endswith(".htm") else '.txt'
    book_id = last_segment.split('.')[0]
    getText('http://www.gutenberg.org/files/' + book_id + '/' + book_id + suffix)
# Collect up to `num` result URLs for the search string `astr` into the global
# `results` list and start processing them with showless().
# Three hard-coded keywords ("berlusconi", "kill", "killer") are answered with a
# fixed message instead of running the search.
# NOTE(review): the indentation of this function was lost in this copy, so it is
# unclear whether showless() belongs to the final else branch or runs for every
# keyword — confirm against the original script.
def googleQuery(astr,num):
# Step up one directory; presumably so the relative 'images/...' paths used by
# the other functions resolve against the parent folder — TODO confirm.
os.chdir("..")
# The comparisons are exact, case-sensitive matches on the raw keyword.
if keyword == "berlusconi":
print "please, do not enter swear words."
elif keyword == "kill" or keyword == "killer":
print "don't be silly"
else:
# islice caps the otherwise open-ended hits() generator at `num` URLs;
# the enumerate index `i` is not used.
for i,h in enumerate(itertools.islice(hits(astr),num)):
results.append(h)
showless()
def showless():
    """Process the search result at index COUNTER, or report that none is left.

    Passes results[COUNTER] to gutenbergClean(). When the list is empty or
    COUNTER has moved past its last entry, "sorry, no matches" is printed
    instead of raising IndexError.
    """
    # COUNTER is only read here, so no `global` declaration is needed.
    if COUNTER < len(results):
        gutenbergClean(results[COUNTER])
    else:
        print("sorry, no matches")
# Script entry point: search with the gutenberg.org-restricted query, keeping at most 50 result URLs.
googleQuery(searchdef,50)