User:Laura Macchini/project1
Tumblrsaurus is a web application for browsing Tumblr in a semantic-associative way.
Given a keyword, it looks up four related words (synonyms, or words used in the same contexts) and displays the latest Tumblr image results for each.
You can keep browsing by clicking on the images or by typing another keyword.
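The related-word lookup relies on NLTK's context index; the script below inlines a modified copy of nltk.Text.similar() so the related words can be returned instead of printed. As a rough, standalone illustration of the idea (not part of the project code; it assumes NLTK is installed, and the Gutenberg book id is an arbitrary example):

<source lang=python>
# Minimal sketch of the "related words" idea using the stock
# nltk.Text.similar(), which prints words that occur in the same
# contexts as the given word. The project code adapts this method
# so the words come back as a string instead.
import nltk
from urllib import urlopen

# any plain-text Project Gutenberg book works as a corpus
raw = urlopen('http://www.gutenberg.org/files/11/11.txt').read()
text = nltk.Text(nltk.word_tokenize(raw))
text.similar('cat', 20)   # prints up to 20 context-mates of 'cat'
</source>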
===Video Documentation===
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>

===Link to the Rebelhuis website===
[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]

==Code Dump==
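In outline: the script reads the query CGI parameter, searches Google for it restricted to site:gutenberg.org, tokenizes a matching Project Gutenberg book with NLTK, keeps the four longest words that share contexts with the keyword, downloads Tumblr images for the keyword and each related word through a Yahoo Pipes RSS feed, and renders everything as jQuery slideshows.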
<source lang=python>
#!/usr/bin/python
import cgi, cgitb
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

# CGI header
print "Content-type: text/html"
print

form = cgi.FieldStorage()
keyword = form.getvalue('query')

print '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" />
<title> Tumblrsaurus </title>
</head>
<body>
<div id="wrapper">
<div id="header">
<div id="realdeal"> you looked for '''+str(keyword)+''' <br/>
<br/>
<img src="../arrow.png" alt="arrow"/>
</div>
</div>
'''

# globals
COUNTER = 1
results = []
searchdef = 'site:gutenberg.org '+str(keyword)
extensions = ("jpg", "png", "bmp", "gif")


def bylength(word1, word2):
    # comparison function: sort words by length
    return len(word1) - len(word2)


def displayKeywords():
    # slideshow 1: the images downloaded for the keyword itself
    first = 1
    targetfolder = './images/'+keyword
    keyword_dir_list = os.listdir(targetfolder)
    print '''<div id="slideshow1">'''
    for url in keyword_dir_list:
        targetfolderurl = targetfolder+"/"+url
        if first == 1:
            # the first image starts out visible
            first = 0
            print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>'
        else:
            print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />'
    print '''</div>'''


def displayImages(similars):
    # one slideshow per related word, after the keyword's own slideshow
    displayKeywords()
    counter2 = 2
    for sim in similars:
        print '''<div id="slideshow'''+str(counter2)+'''">'''
        counter2 += 1
        first = 1
        targetfolder = './images/'+sim
        keyword_dir_list = os.listdir(targetfolder)
        for url in keyword_dir_list:
            targetfolderurl = targetfolder+"/"+url
            if first == 1:
                first = 0
                print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>'
            else:
                print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />'
        print ''' </div> '''
    print '''</div> </body> </html>'''


def tumblrDownloader(zoekwoord):
    # fetch the latest tumblr image results for a word through a Yahoo
    # Pipes RSS feed and store them under images/<word>
    pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+zoekwoord
    feed = feedparser.parse(pipe)
    # we need a folder to store the images
    targetfolder = 'images/'+zoekwoord
    if not os.path.isdir(targetfolder):
        os.mkdir(targetfolder)
    for e in feed.entries:
        words = e.description.split()
        for i in range(len(words)):
            if words[i].endswith("img"):
                # the token after '<img' is 'src="..."'; strip 'src="'
                # and the closing quote to get the image URL
                pipe = words[i+1][5:-1]
                filename = pipe.split('/')[-1]
                for ext in extensions:
                    # unfinished: if the URL has no extension, add jpg
                    os.chdir(targetfolder)
                    if os.path.isfile(filename):
                        # file exists, skip
                        os.chdir("../..")
                    else:
                        # download, make world-readable, scale to 250px wide
                        cmd = 'wget -q %s' % pipe
                        cmd2 = 'chmod 755 %s' % filename
                        cmd3 = 'mogrify -resize 250 %s' % filename
                        os.system(cmd)
                        os.system(cmd2)
                        os.system(cmd3)
                        os.chdir("../..")
                    break


def hits(astr):
    # generator: page through Google AJAX Search results, yielding URLs
    for start in itertools.count():
        query = urllib.urlencode({'q': astr, 'rsz': 8, 'start': start*8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (query)
        search_results = urllib.urlopen(url)
        response = json.loads(search_results.read())
        data = response['responseData']
        if data:
            for h in data['results']:
                yield h['url']
        else:
            return


def similar(self, word, num):
    # adapted from nltk.Text.similar(): return (rather than print) up to
    # num words that appear in the same contexts as `word`
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions()
                      for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]
        return tokenwrap(words)


def tumblrMachine(similars):
    # download images for the keyword and each related word, then render
    tumblrDownloader(keyword)
    # print similars  # debug
    for e in similars:
        tumblrDownloader(e)
    displayImages(similars)


def getText(url):
    # tokenize the book at `url` and collect up to four related words
    global COUNTER
    similars = []
    raw = urlopen(url).read()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    words = similar(text, keyword, 20)
    if words and len(words.split()) > 4:
        for w in words.split():
            if len(w) > 3:
                similars.append(w)
        # keep the four longest related words
        similars.sort(cmp=bylength)
        similars.reverse()
        similars = similars[0:4]
        tumblrMachine(similars)
    else:
        # too few matches in this text: move on to the next search result
        COUNTER += 1
        showless()


def gutenbergClean(url):
    # turn a gutenberg.org search hit into a direct link to the book file
    filename = url.split('/')[-1]
    if filename.endswith(".htm"):
        filename = filename.split('.')[0]
        gutenbergUrl = 'http://www.gutenberg.org/files/'+filename+'/'+filename+'.htm'
    else:
        filename = filename.split('.')[0]
        gutenbergUrl = 'http://www.gutenberg.org/files/'+filename+'/'+filename+'.txt'
    getText(gutenbergUrl)


def googleQuery(astr, num):
    os.chdir("..")
    if keyword == "berlusconi":
        print "please, do not enter swear words."
    elif keyword == "kill" or keyword == "killer":
        print "don't be silly"
    else:
        for h in itertools.islice(hits(astr), num):
            results.append(h)
        showless()


def showless():
    # render results for the COUNTER-th Google hit
    global COUNTER
    if results and COUNTER < len(results):
        gutenbergClean(results[COUNTER])
    else:
        print "sorry, no matches"


googleQuery(searchdef, 50)
</source>
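One line in tumblrDownloader() worth unpacking is the words[i+1][5:-1] slice. The feed description embeds each image as HTML, and after splitting on whitespace the token following <img is its src attribute. A small worked example (the markup shape and URL are made up for illustration):

<source lang=python>
# Worked example of the words[i+1][5:-1] slice, assuming feed markup of
# the form <img src="..." /> as the project code does.
description = 'some caption <img src="http://example.com/cat.jpg" />'
words = description.split()
for i in range(len(words)):
    if words[i].endswith("img"):
        # words[i+1] is 'src="http://example.com/cat.jpg"';
        # [5:-1] strips the leading 'src="' and the trailing quote
        print words[i+1][5:-1]    # -> http://example.com/cat.jpg
</source>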