User:Laura Macchini/project1: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
 
(5 intermediate revisions by the same user not shown)
Line 1: Line 1:
Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way.
Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for the keyword and each related word.
It is possible to browse the results by clicking on the images or typing another keyword.
===Video Documentation===
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>
===Link to the Rebelhuis website===
[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]
==Code Dump==
==Code Dump==
 
<source lang=python>
<source lang="python">
#!/usr/bin/python
#!/usr/bin/python
import cgi, cgitb  
import cgi, cgitb  
Line 12: Line 22:
print "Content-type: text/html"
print "Content-type: text/html"
print
print
form = cgi.FieldStorage()
keyword = form.getvalue('query')


print '''
print '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> '''
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" />
 
<title> Tumblrsaurus </title>
</head>


print '''<head>  
<body>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />  
<div id="wrapper">
<title> Tumblrsaurus </title>  
<div id="header">
</head> '''
<div id="realdeal"> you looked for '''+str(keyword)+''' <br/>
<br/>
<img src="../arrow.png" alt="arrow"/>
</div>
</div>
'''


print '''<body>'''
form = cgi.FieldStorage()
keyword = form.getvalue('query')


#global
#global
Line 30: Line 53:


results = []
results = []
searchdef = 'site:gutenberg.org '+keyword
searchdef = 'site:gutenberg.org '+str(keyword)
extensions = ("jpg","png","bmp","gif")
extensions = ("jpg","png","bmp","gif")


Line 37: Line 60:


def displayKeywords():
def displayKeywords():
# print "displayimages called"
first = 1
targetfolder = './images/'+keyword
targetfolder = './images/'+keyword
keyword_dir_list = os.listdir(targetfolder)
keyword_dir_list = os.listdir(targetfolder)
print '''<div id="slideshow1">'''
for url in keyword_dir_list:
for url in keyword_dir_list:
targetfolderurl = targetfolder+"/"+url
if first == 1:
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />'
first = 0
print "<br/>"
targetfolderurl = targetfolder+"/"+url
print '''</body>  </html>'''
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>'
else:
targetfolderurl = targetfolder+"/"+url
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />'
print '''</div>'''


def displayImages(similars):
def displayImages(similars):
# RESIZE THE **** IMAGES
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
displayKeywords()
displayKeywords()
counter2 = 2
for sim in similars:
for sim in similars:
print '''<div id="slideshow'''+str(counter2)+'''">'''
counter2 += 1
first = 1
targetfolder = './images/'+sim
targetfolder = './images/'+sim
keyword_dir_list = os.listdir(targetfolder)
keyword_dir_list = os.listdir(targetfolder)
for url in keyword_dir_list:
for url in keyword_dir_list:
targetfolderurl = targetfolder+"/"+url
if first == 1:
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />'
first = 0
print "<br/>"
targetfolderurl = targetfolder+"/"+url
print '''</body> </html>'''
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>'
else:
targetfolderurl = targetfolder+"/"+url
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />'
print ''' </div> '''
print '''</div> </body> </html>'''




Line 83: Line 118:
cmd = 'wget -q %s' %pipe
cmd = 'wget -q %s' %pipe
cmd2 = 'chmod 755 %s' %filename
cmd2 = 'chmod 755 %s' %filename
cmd3 = 'mogrify -resize 300 %s' %filename
cmd3 = 'mogrify -resize 250 %s' %filename
os.system(cmd)
os.system(cmd)
os.system(cmd2)
os.system(cmd2)
Line 122: Line 157:
def tumblrMachine(similars):
def tumblrMachine(similars):
tumblrDownloader(keyword)
tumblrDownloader(keyword)
print similars
for e in similars:
for e in similars:
tumblrDownloader(e)
tumblrDownloader(e)
Line 166: Line 202:
if keyword == "berlusconi":
if keyword == "berlusconi":
print "please, do not enter swear words."
print "please, do not enter swear words."
elif keyword == "kill" or keyword == "killer":
print "don't be silly"
else:
else:
for i,h in enumerate(itertools.islice(hits(astr),num)):
for i,h in enumerate(itertools.islice(hits(astr),num)):
Line 182: Line 220:


googleQuery(searchdef,50)
googleQuery(searchdef,50)


</source>
</source>

Latest revision as of 10:31, 24 May 2011

Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way. Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for the keyword and each related word. You can browse the results by clicking on the images or by typing another keyword.

Video Documentation

<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>


Link to the Rebelhuis website

[go!]

Code Dump

#!/usr/bin/python
import cgi, cgitb 
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

print "Content-type: text/html"
print

form = cgi.FieldStorage() 
keyword = form.getvalue('query')

print '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> 
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" /> 

<title> Tumblrsaurus </title> 
</head>

<body>
<div id="wrapper">
<div id="header"> 
	<div id="realdeal"> you looked for '''+str(keyword)+''' <br/>
	<br/>
	<img src="../arrow.png" alt="arrow"/>
	</div>
</div>
'''


# Module-level state shared by the functions below.

# Index into `results` of the search hit that showless() hands on next;
# getText() increments it when a hit yields too few related words.
COUNTER = 1

# Search-hit URLs, filled in by googleQuery().
results = []

# Image file extensions that tumblrDownloader() loops over.
extensions = ("jpg", "png", "bmp", "gif")

# Search string passed to googleQuery(): the keyword restricted to gutenberg.org.
searchdef = "site:gutenberg.org " + str(keyword)

def bylength(word1, word2):
	"""cmp-style comparator that orders words by their length.

	Returns a negative number, zero or a positive number when word1 is
	shorter than, as long as, or longer than word2.
	"""
	first_len, second_len = len(word1), len(word2)
	return first_len - second_len

def displayKeywords():
	first = 1
	targetfolder = './images/'+keyword
	keyword_dir_list = os.listdir(targetfolder)
	print '''<div id="slideshow1">'''
	for url in keyword_dir_list:
		if first == 1:
			first = 0
			targetfolderurl = targetfolder+"/"+url
			print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>'
		else:
			targetfolderurl = targetfolder+"/"+url
			print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />'
	print '''</div>'''

def displayImages(similars):
	displayKeywords()
	counter2 = 2
	for sim in similars:
		print '''<div id="slideshow'''+str(counter2)+'''">'''
		counter2 += 1
		first = 1
		targetfolder = './images/'+sim
		keyword_dir_list = os.listdir(targetfolder)
		for url in keyword_dir_list:
			if first == 1:
				first = 0
				targetfolderurl = targetfolder+"/"+url
				print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>'
			else:
				targetfolderurl = targetfolder+"/"+url
				print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />'
		print ''' </div> '''
	print '''</div> </body> </html>'''


def _run(args, cwd=None):
	"""Run an external program without a shell; return True when it exits with status 0."""
	import subprocess  # local import: the module is not imported at file level
	try:
		return subprocess.call(args, cwd=cwd) == 0
	except OSError:
		# The program is missing or cannot be started: treat it as a failed
		# command instead of aborting the whole request.
		return False


def tumblrDownloader(zoekwoord):
	"""Download the images listed in the tumblr feed for *zoekwoord* into images/<zoekwoord>/.

	zoekwoord -- search word; used both as the feed query and as folder name.

	Each feed entry's description is split on whitespace; the token that
	follows one ending in "img" is taken to be src="<url>" and the URL is cut
	out of it. Files already present in the folder are skipped. New files
	are fetched with wget, made mode 755 and resized with `mogrify -resize 250`.
	"""
	# Encode the word so spaces, '&' etc. cannot break or extend the query string.
	pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+urllib.quote_plus(zoekwoord)
	feed = feedparser.parse(pipe)
	# We need a folder to store the images
	# NOTE(review): zoekwoord is used verbatim as a directory name -- confirm
	# callers never pass path separators.
	targetfolder = 'images/'+zoekwoord
	if not os.path.isdir(targetfolder):
		os.mkdir(targetfolder)
	for e in feed.entries:
		words = e.description.split()
		# Stop one short of the end: the URL lives in the token after "img".
		for i in range(len(words) - 1):
			if not words[i].endswith("img"):
				continue
			imgurl = words[i+1][5:-1]  # strip the leading src=" and the closing quote
			filename = imgurl.split('/')[-1]
			target = os.path.join(targetfolder, filename)
			if not filename or os.path.isfile(target):
				# nothing usable in the URL, or the image was downloaded before
				continue
			# Argument lists instead of shell strings: the URL and file name
			# come from the feed and must never be interpreted by a shell.
			_run(['wget', '-q', imgurl], cwd=targetfolder)
			if not os.path.isfile(target):
				# download failed or was saved under another name
				continue
			os.chmod(target, 0o755)
			_run(['mogrify', '-resize', '250', target])

def hits(astr):
	"""Yield result URLs from the Google AJAX web-search API for the query *astr*.

	Result pages of eight entries are requested one after another. The
	generator ends when a response has no 'responseData' or when a page comes
	back without results.
	"""
	for page in itertools.count():
		query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': page*8})
		url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
		search_results = urllib.urlopen(url)
		try:
			payload = json.loads(search_results.read())
		finally:
			search_results.close()
		data = payload['responseData']
		page_hits = data.get('results') if data else None
		if not page_hits:
			# A page without results would otherwise be requested forever.
			# A plain return ends the generator; raising StopIteration inside
			# a generator body is an error under PEP 479.
			return
		for h in page_hits:
			yield h['url']


def similar(self, word, num):
	"""Return words that occur in the same contexts as *word*, or None.

	self -- text object exposing `tokens`; a ContextIndex built from them is
	        cached on it as `_word_context_index` on first use.
	word -- word to look up (lower-cased before the lookup).
	num  -- maximum number of words to return.

	The result is the tokenwrap() of at most *num* keys of a FreqDist over
	the words sharing a context with *word*. None is returned when *word* is
	not one of the index's conditions.
	"""
	if '_word_context_index' not in self.__dict__:
		self._word_context_index = ContextIndex(self.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower())
	target = word.lower()
	wci = self._word_context_index._word_to_contexts
	if target not in wci.conditions():
		return None
	shared = set(wci[target])
	fd = FreqDist(w for w in wci.conditions() for c in wci[w] if c in shared and w != target)
	return tokenwrap(fd.keys()[:num])

def tumblrMachine(similars):
	tumblrDownloader(keyword)
	print similars
	for e in similars:
		tumblrDownloader(e)
	displayImages(similars)

def getText(url):
	"""Look for words used like the keyword in the text at *url* and build the page from them.

	The text is downloaded, tokenized with nltk and searched with similar()
	for up to 20 words sharing contexts with the module-level `keyword`.
	When more than four words come back, those longer than three characters
	are sorted longest first and the first four go to tumblrMachine().
	Otherwise (too few words, or none at all) COUNTER is advanced and
	showless() is called to try the next search hit.
	"""
	global COUNTER
	source = urlopen(url)
	try:
		raw = source.read()
	finally:
		source.close()
	text = nltk.Text(nltk.word_tokenize(raw))
	words = similar(text, keyword, 20)
	# similar() returns None when the keyword does not occur in the text;
	# treat that like "too few words" so the next hit is tried instead of
	# silently ending with a half-built page.
	candidates = words.split() if words else []
	if len(candidates) > 4:
		similars = [w for w in candidates if len(w) > 3]
		# Longest first: stable ascending sort by length, then reversed.
		similars.sort(key=len)
		similars.reverse()
		tumblrMachine(similars[0:4])
	else:
		COUNTER += 1
		showless()

def gutenbergClean(url):
	"""Turn a search-hit URL into a gutenberg.org/files/ URL and hand it to getText().

	The book id is the part of the URL's last path component before its
	first dot. The new URL is .../files/<id>/<id>.htm when that component
	ends with ".htm", and .../files/<id>/<id>.txt otherwise.
	"""
	leaf = url.split('/')[-1]
	suffix = '.htm' if leaf.endswith(".htm") else '.txt'
	book_id = leaf.split('.')[0]
	getText('http://www.gutenberg.org/files/'+book_id+'/'+book_id+suffix)

def googleQuery(astr,num):
	os.chdir("..") 
	if keyword == "berlusconi":
		print "please, do not enter swear words."
	elif keyword == "kill" or keyword == "killer":
		print "don't be silly"
	else:
		for i,h in enumerate(itertools.islice(hits(astr),num)):
			results.append(h)
		showless()

def showless():
	global COUNTER
	if results:
		urlG = results[COUNTER]
		gutenbergClean(urlG)
	else:
		print "sorry, no matches"



# Entry point: search with the string built from the keyword, collecting at most 50 hits.
googleQuery(searchdef,50)