User:Laura Macchini/project1: Difference between revisions
No edit summary |
No edit summary |
||
(5 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
Tumblsaurus is a web application that lets you browse Tumblr in a semantic-associative way. | |||
Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them. | |||
It is possible to browse the results by clicking on the images or typing another keyword. | |||
===Video Documentation=== | |||
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video> | |||
===Link to the Rebelhuis website=== | |||
[[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]] | |||
==Code Dump== | ==Code Dump== | ||
<source lang=python> | |||
<source lang= | |||
#!/usr/bin/python | #!/usr/bin/python | ||
import cgi, cgitb | import cgi, cgitb | ||
Line 12: | Line 22: | ||
print "Content-type: text/html" | print "Content-type: text/html" | ||
print | print | ||
form = cgi.FieldStorage() | |||
keyword = form.getvalue('query') | |||
print ''' | print ''' | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | ||
<head> | |||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> | |||
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script> | |||
<script type="text/javascript" src="../slideshows.js"></script> | |||
<link rel="stylesheet" href="../style.css" /> | |||
<title> Tumblrsaurus </title> | |||
</head> | |||
<body> | |||
< | <div id="wrapper"> | ||
<div id="header"> | |||
</ | <div id="realdeal"> you looked for '''+str(keyword)+''' <br/> | ||
<br/> | |||
<img src="../arrow.png" alt="arrow"/> | |||
</div> | |||
</div> | |||
''' | |||
#global | #global | ||
Line 30: | Line 53: | ||
results = [] | results = [] | ||
searchdef = 'site:gutenberg.org '+keyword | searchdef = 'site:gutenberg.org '+str(keyword) | ||
extensions = ("jpg","png","bmp","gif") | extensions = ("jpg","png","bmp","gif") | ||
Line 37: | Line 60: | ||
def displayKeywords(): | def displayKeywords(): | ||
first = 1 | |||
targetfolder = './images/'+keyword | targetfolder = './images/'+keyword | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
print '''<div id="slideshow1">''' | |||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print " | targetfolderurl = targetfolder+"/"+url | ||
print '''</ | print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>' | ||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />' | |||
print '''</div>''' | |||
def displayImages(similars): | def displayImages(similars): | ||
displayKeywords() | displayKeywords() | ||
counter2 = 2 | |||
for sim in similars: | for sim in similars: | ||
print '''<div id="slideshow'''+str(counter2)+'''">''' | |||
counter2 += 1 | |||
first = 1 | |||
targetfolder = './images/'+sim | targetfolder = './images/'+sim | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print "< | targetfolderurl = targetfolder+"/"+url | ||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>' | |||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />' | |||
print ''' </div> ''' | |||
print '''</div> </body> </html>''' | |||
Line 83: | Line 118: | ||
cmd = 'wget -q %s' %pipe | cmd = 'wget -q %s' %pipe | ||
cmd2 = 'chmod 755 %s' %filename | cmd2 = 'chmod 755 %s' %filename | ||
cmd3 = 'mogrify -resize | cmd3 = 'mogrify -resize 250 %s' %filename | ||
os.system(cmd) | os.system(cmd) | ||
os.system(cmd2) | os.system(cmd2) | ||
Line 122: | Line 157: | ||
def tumblrMachine(similars): | def tumblrMachine(similars): | ||
tumblrDownloader(keyword) | tumblrDownloader(keyword) | ||
print similars | |||
for e in similars: | for e in similars: | ||
tumblrDownloader(e) | tumblrDownloader(e) | ||
Line 166: | Line 202: | ||
if keyword == "berlusconi": | if keyword == "berlusconi": | ||
print "please, do not enter swear words." | print "please, do not enter swear words." | ||
elif keyword == "kill" or keyword == "killer": | |||
print "don't be silly" | |||
else: | else: | ||
for i,h in enumerate(itertools.islice(hits(astr),num)): | for i,h in enumerate(itertools.islice(hits(astr),num)): | ||
Line 182: | Line 220: | ||
googleQuery(searchdef,50) | googleQuery(searchdef,50) | ||
</source> | </source> |
Latest revision as of 10:31, 24 May 2011
Tumblsaurus is a web application that lets you browse Tumblr in a semantic-associative way. Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them. You can browse the results by clicking on the images or by typing another keyword.
Video Documentation
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>
Link to the Rebelhuis website
[go!]
Code Dump
#!/usr/bin/python
import cgi, cgitb
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap
# CGI response header: the content type line followed by the mandatory blank
# line that separates HTTP headers from the body.
print("Content-type: text/html")
print("")

# The search term arrives as the "query" form field; getvalue() returns None
# when the field is absent, which is why it is passed through str() below.
form = cgi.FieldStorage()
keyword = form.getvalue('query')

# Page head and the "you looked for ..." banner.  The keyword comes straight
# from the request, so it is HTML-escaped (quotes included) before being
# embedded in the markup; the raw value stays in ``keyword`` for the rest of
# the script.
print('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" />
<title> Tumblrsaurus </title>
</head>
<body>
<div id="wrapper">
<div id="header">
<div id="realdeal"> you looked for ''' + cgi.escape(str(keyword), True) + ''' <br/>
<br/>
<img src="../arrow.png" alt="arrow"/>
</div>
</div>
''')
# Module-level state shared by the functions below.

# Index into ``results`` of the search hit that showless() tries next;
# getText() increments it when a text yields nothing usable.
COUNTER = 1

# Hit URLs collected by googleQuery().
results = []

# Search expression passed to googleQuery() at the bottom of the script.
searchdef = ''.join(['site:gutenberg.org ', str(keyword)])

# Image file extensions iterated by tumblrDownloader().
extensions = (
    "jpg",
    "png",
    "bmp",
    "gif",
)
def bylength(word1, word2):
    """cmp-style comparator ordering two sized objects by their length.

    Returns a negative number when ``word1`` is shorter than ``word2``,
    zero when both have the same length, and a positive number otherwise.
    """
    first_length = len(word1)
    second_length = len(word2)
    return first_length - second_length
def displayKeywords():
    """Print the ``slideshow1`` <div> with every image stored for the keyword.

    Reads the module-level ``keyword`` and lists the directory
    ``./images/<keyword>``.  One <img> tag is printed per directory entry;
    the first one additionally carries ``class="active"``.

    The keyword comes from the request and the file names come from
    downloaded content, so both are HTML-escaped (quotes included) before
    being placed inside attribute values.

    Raises whatever ``os.listdir`` raises when the directory is missing.
    """
    targetfolder = './images/' + keyword
    image_names = os.listdir(targetfolder)
    alt_text = cgi.escape(keyword, True)

    print('''<div id="slideshow1">''')
    for index, image_name in enumerate(image_names):
        src = cgi.escape('../' + targetfolder + "/" + image_name, True)
        if index == 0:
            # Only the first image is marked active.
            print('<img src="' + src + '" alt="' + alt_text + '" class="active"/>')
        else:
            print('<img src="' + src + '" alt="' + alt_text + '" />')
    print('''</div>''')
def displayImages(similars):
    """Print the slideshows for the keyword and its related words, then close the page.

    Calls ``displayKeywords()`` first (slideshow 1), then prints one
    ``<div id="slideshowN">`` per entry of ``similars``, numbered from 2, each
    filled with the images found in ``./images/<word>``.  The first image of
    every slideshow carries ``class="active"``.  Finally the wrapper <div>,
    <body> and <html> elements are closed.

    similars -- iterable of related words (strings); each must already have
                an ``./images/<word>`` directory, otherwise ``os.listdir``
                raises.

    Words and file names are HTML-escaped (quotes included) before being
    placed inside attribute values, because both originate from remote
    content.
    """
    displayKeywords()

    for slideshow_number, sim in enumerate(similars, 2):
        print('''<div id="slideshow''' + str(slideshow_number) + '''">''')
        targetfolder = './images/' + sim
        alt_text = cgi.escape(sim, True)
        for index, image_name in enumerate(os.listdir(targetfolder)):
            src = cgi.escape('../' + targetfolder + "/" + image_name, True)
            if index == 0:
                # Only the first image of each slideshow is marked active.
                print('<img src="' + src + '" alt="' + alt_text + '" class="active"/>')
            else:
                print('<img src="' + src + '" alt="' + alt_text + '" />')
        print(''' </div> ''')

    # Closes the wrapper <div> opened in the page header, then the document.
    print('''</div> </body> </html>''')
def tumblrDownloader(zoekwoord):
    """Download the images of the Tumblr feed for ``zoekwoord`` into ``images/<zoekwoord>``.

    The RSS feed is fetched through the Yahoo Pipes URL below and parsed with
    ``feedparser``.  Every entry description is split on whitespace; a token
    ending in ``img`` marks an image tag and the following token is taken as
    its ``src`` attribute.  Each image that is not on disk yet is fetched with
    ``wget``, made mode 755 and resized with ``mogrify -resize 250``.

    zoekwoord -- search word (string); also used as the directory name.

    External commands are run from argument lists without a shell, because
    the URLs and file names come from remote feed content and must never be
    interpreted by a shell.  Exit codes are ignored, as before: a failed
    download is simply skipped.

    NOTE(review): the original looped over ``extensions`` without using the
    loop variable and left the loop after the first pass, so it is dropped
    here; confirm no per-extension check was intended.
    """
    # Local import so the block does not depend on the file-level import list.
    import subprocess

    feed_url = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+zoekwoord
    feed = feedparser.parse(feed_url)

    # We need a folder to store the images
    targetfolder = 'images/'+zoekwoord
    if not os.path.isdir(targetfolder):
        os.mkdir(targetfolder)

    for entry in feed.entries:
        words = entry.description.split()
        # Stop one short of the end so words[position + 1] always exists.
        for position, word in enumerate(words[:-1]):
            if not word.endswith("img"):
                continue

            # Drops the first five characters and the last one -- presumably
            # the leading src=" and the closing quote; confirm against the
            # feed markup.
            image_url = words[position + 1][5:-1]
            filename = image_url.split('/')[-1]
            if not filename:
                continue

            local_path = os.path.join(targetfolder, filename)
            if os.path.isfile(local_path):
                # Already downloaded on an earlier request.
                continue

            # Run wget inside the target folder instead of chdir-ing the whole
            # process; "--" keeps a URL starting with "-" from being read as
            # an option.
            subprocess.call(['wget', '-q', '--', image_url], cwd=targetfolder)
            if os.path.isfile(local_path):
                os.chmod(local_path, 0o755)
                subprocess.call(['mogrify', '-resize', '250', local_path])
def hits(astr):
    """Yield result URLs for the query ``astr``, one page of eight at a time.

    Each page is requested from the Google AJAX web-search endpoint with
    ``rsz=8`` and ``start`` advancing in steps of eight.  The JSON reply's
    ``responseData`` is read; its ``results`` list supplies the ``url``
    values that are yielded.

    The generator ends when ``responseData`` is empty/null or when a page
    carries no results (otherwise an empty page would make it request pages
    forever).

    astr -- search expression (string).
    """
    for page in itertools.count():
        query = urllib.urlencode({'q': astr, 'rsz': 8, 'start': page * 8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (query)

        response = urllib.urlopen(url)
        try:
            payload = json.loads(response.read())
        finally:
            # Release the connection even when the body is not valid JSON.
            response.close()

        data = payload['responseData']
        if not data:
            # A plain return ends a generator; raising StopIteration inside
            # one is an error under PEP 479.
            return

        page_hits = data['results']
        if not page_hits:
            return
        for hit in page_hits:
            yield hit['url']
def similar(self, word, num):
    """Return words that share contexts with ``word`` in the text ``self``.

    self -- text object exposing ``tokens``; a ``ContextIndex`` over the
            alphabetic, lower-cased tokens is built on first use and cached
            on the object as ``_word_context_index``.
    word -- word to look up (compared lower-cased).
    num  -- maximum number of words to return.

    Returns ``tokenwrap(...)`` of the first ``num`` keys of a ``FreqDist``
    counting every other word once per context it shares with ``word``
    (presumably most frequent first -- depends on the NLTK version's
    ``FreqDist.keys()`` ordering, confirm).  Returns None when ``word`` is
    not in the index.
    """
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower(),
        )

    target = word.lower()
    context_map = self._word_context_index._word_to_contexts
    if target not in context_map.conditions():
        return None

    shared_contexts = set(context_map[target])
    counts = FreqDist(
        other
        for other in context_map.conditions()
        for context in context_map[other]
        if context in shared_contexts and other != target
    )
    return tokenwrap(counts.keys()[:num])
def tumblrMachine(similars):
    """Fetch the images for the keyword and every related word, then render them.

    Downloads for the module-level ``keyword`` first, prints ``similars`` to
    the response, downloads for each related word in order and finally hands
    the list to ``displayImages``.

    similars -- list of related words (strings).
    """
    tumblrDownloader(keyword)
    print(similars)
    for related_word in similars:
        tumblrDownloader(related_word)
    displayImages(similars)
def getText(url):
    """Mine the text at ``url`` for words related to the keyword and show their images.

    The document is downloaded, tokenized with ``nltk.word_tokenize`` and
    wrapped in ``nltk.Text``; ``similar`` is then asked for up to 20 words
    sharing contexts with the module-level ``keyword``.

    When more than four words come back, those longer than three characters
    are ordered longest first, the top four are kept and passed to
    ``tumblrMachine``.

    In every other case (no words at all, or four or fewer) ``COUNTER`` is
    advanced and ``showless`` tries the next search hit, so the response is
    never left without a result or a "no matches" message.

    url -- address of the text to analyse (string).
    """
    global COUNTER

    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()

    text = nltk.Text(nltk.word_tokenize(raw))
    words = similar(text, keyword, 20)
    candidates = words.split() if words else []

    if len(candidates) > 4:
        similars = [w for w in candidates if len(w) > 3]
        # Stable ascending sort followed by reverse(): longest first, with
        # equal-length words in reversed original order (same result as the
        # former cmp=bylength sort + reverse).
        similars.sort(key=len)
        similars.reverse()
        tumblrMachine(similars[0:4])
    else:
        # Nothing usable in this text: move on to the next hit.
        COUNTER += 1
        showless()
def gutenbergClean(url):
    """Turn a search-hit URL into a gutenberg.org file URL and analyse it.

    The last path segment of ``url`` up to its first dot is used as the book
    id.  The target is ``http://www.gutenberg.org/files/<id>/<id>.htm`` when
    the segment ends in ``.htm`` and ``.../<id>.txt`` otherwise; it is passed
    to ``getText``.

    url -- hit URL (string).
    """
    last_segment = url.split('/')[-1]
    suffix = '.htm' if last_segment.endswith(".htm") else '.txt'
    book_id = last_segment.split('.')[0]
    getText('http://www.gutenberg.org/files/' + book_id + '/' + book_id + suffix)
def googleQuery(astr,num):
    """Collect search hits for ``astr`` and start processing them.

    Changes the working directory to the parent directory first.  Two
    keywords are refused with a message; for anything else up to ``num``
    URLs from ``hits(astr)`` are appended to the module-level ``results``
    and ``showless`` is called.

    astr -- search expression (string).
    num  -- maximum number of hit URLs to collect.

    NOTE(review): the source's indentation was lost; showless() is taken to
    belong to the non-refused branch only -- confirm.
    """
    os.chdir("..")

    if keyword == "berlusconi":
        print("please, do not enter swear words.")
        return
    if keyword in ("kill", "killer"):
        print("don't be silly")
        return

    for hit_url in itertools.islice(hits(astr), num):
        results.append(hit_url)
    showless()
def showless():
    """Process the search hit at index ``COUNTER``, or report that none is left.

    Reads the module-level ``results`` and ``COUNTER``.  When ``COUNTER``
    still points inside ``results`` that URL is handed to
    ``gutenbergClean``; otherwise "sorry, no matches" is printed.

    The bounds check covers both an empty ``results`` and the case where
    every hit has been tried (or only one hit exists while ``COUNTER``
    starts at 1), which previously raised IndexError.
    """
    if COUNTER < len(results):
        gutenbergClean(results[COUNTER])
    else:
        print("sorry, no matches")
# Script entry point: run the search for the prepared expression, collecting
# at most 50 hit URLs.
googleQuery(searchdef,50)