User:Laura Macchini/project1: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
Line 4: Line 4:


===Video Documentation===
===Video Documentation===
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>


[[Media:http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv]]


===Link to the Rebelhuis website===
===Link to the Rebelhuis website===

Revision as of 11:49, 8 March 2011

Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way. Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them. You can keep browsing the results by clicking on the images or by typing another keyword.

Video Documentation

<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>


Link to the Rebelhuis website

[| go!]

Code Dump

#!/usr/bin/python
import cgi, cgitb 
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

# CGI response header, followed by the blank line that terminates the header section.
print("Content-type: text/html")
print("")

form = cgi.FieldStorage()
# Raw search term taken from the `query` form field (None when the field is absent).
# The rest of the script uses this value as-is for searching and for folder names.
keyword = form.getvalue('query')

# Page preamble. The keyword is user-supplied, so it is HTML-escaped before being
# echoed back; otherwise markup in the query string would be rendered by the browser.
print('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> 
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" /> 

<title> Tumblrsaurus </title> 
</head>

<body>
<div id="wrapper">
<div id="header"> 
	<div id="realdeal"> you looked for '''+cgi.escape(str(keyword))+''' <br/>
	<br/>
	<img src="../arrow.png" alt="arrow"/>
	</div>
</div>
''')


# --- module-level state -------------------------------------------------

# Index into `results` of the search hit that showless() will try next.
COUNTER = 1

# Search-hit URLs; filled by googleQuery().
results = []

# Search string: the keyword, restricted to pages on gutenberg.org.
searchdef = 'site:gutenberg.org ' + str(keyword)

# Image file extensions.
extensions = (
	"jpg",
	"png",
	"bmp",
	"gif",
)

def bylength(word1, word2):
	"""Comparison function ordering words by length.

	Returns a negative number when `word1` is shorter than `word2`, zero when
	both have the same length, and a positive number when `word1` is longer.
	"""
	first_size = len(word1)
	second_size = len(word2)
	return first_size - second_size

def displayKeywords():
	"""Print the first slideshow <div>, holding the images stored for the keyword.

	Reads the module-level `keyword`, lists ./images/<keyword> and prints one
	<img> tag per directory entry; the first tag carries class="active" and
	the others do not. A missing or unreadable folder produces an empty
	slideshow instead of an uncaught OSError.
	"""
	# NOTE(review): `keyword` is used as a path component here - confirm
	# whether it should be validated before it reaches this function.
	targetfolder = './images/'+keyword
	try:
		keyword_dir_list = os.listdir(targetfolder)
	except OSError:
		keyword_dir_list = []
	# Everything interpolated into an attribute value is escaped (quotes
	# included) so the keyword or a file name cannot break out of the tag.
	alt = cgi.escape(keyword, True)
	print('''<div id="slideshow1">''')
	for position, url in enumerate(keyword_dir_list):
		src = cgi.escape('../'+targetfolder+'/'+url, True)
		if position == 0:
			print('<img src="'+src+'" alt="'+alt+'" class="active"/>')
		else:
			print('<img src="'+src+'" alt="'+alt+'" />')
	print('''</div>''')

def displayImages(similars):
	"""Print the keyword slideshow, one slideshow per related word, then close the page.

	`similars` is an iterable of words. Each word gets a <div> whose id is
	"slideshow2", "slideshow3", ... in iteration order, filled with one <img>
	per entry of ./images/<word>; the first image of every slideshow carries
	class="active". Finally the wrapper <div>, <body> and <html> are closed.
	"""
	displayKeywords()
	for slot, sim in enumerate(similars, 2):
		print('''<div id="slideshow'''+str(slot)+'''">''')
		targetfolder = './images/'+sim
		for position, url in enumerate(os.listdir(targetfolder)):
			targetfolderurl = targetfolder+"/"+url
			if position == 0:
				# Only the first image of a slideshow is marked active.
				print('<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>')
			else:
				print('<img src="../'+targetfolderurl+'" alt="'+sim+'" />')
		print(''' </div> ''')
	print('''</div> </body> </html>''')


def tumblrDownloader(zoekwoord):
	"""Download the images that the Yahoo Pipes feed returns for `zoekwoord`.

	Images are stored in images/<zoekwoord>/ (created on demand). Each newly
	downloaded file is given mode 755 and resized with `mogrify -resize 250`.
	Files that already exist in the folder are skipped.
	"""
	import subprocess  # local import: not among the file's top-level imports

	# Quote the search term so spaces and reserved characters survive in the URL.
	pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+urllib.quote_plus(zoekwoord)
	feed = feedparser.parse(pipe)
	# We need a folder to store the images; makedirs also creates "images" itself.
	targetfolder = 'images/'+zoekwoord
	if not os.path.isdir(targetfolder):
		os.makedirs(targetfolder)
	for e in feed.entries:
		words = e.description.split()
		# Pair every token with its successor, so a trailing "img" token
		# cannot index past the end of the list.
		for token, following in zip(words, words[1:]):
			if not token.endswith("img"):
				continue
			# Drops the first five characters and the last one; presumably the
			# token looks like src="URL" - confirm against the feed output.
			image_url = following[5:-1]
			filename = image_url.split('/')[-1]
			if not filename:
				continue
			target = os.path.join(targetfolder, filename)
			if os.path.isfile(target):
				# Already downloaded on an earlier run.
				continue
			# The URL comes from remote feed content, so the external tools are
			# invoked with argument lists (no shell), and paths are built
			# explicitly instead of chdir-ing in and out of the folder.
			try:
				subprocess.call(['wget', '-q', '--', image_url], cwd=targetfolder)
				if os.path.isfile(target):
					os.chmod(target, 0o755)
					subprocess.call(['mogrify', '-resize', '250', target])
			except OSError:
				# A missing tool or failed chmod skips this image, keeping the
				# download best-effort as it was with os.system.
				continue

def hits(astr):
	"""Generate result URLs for the search string `astr`, page after page.

	Each request asks for 8 results (`rsz`), starting at offset 0, 8, 16, ...
	The generator ends when a response has no `responseData` or when a page
	comes back without results.
	"""
	for start in itertools.count():
		query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': start*8})
		url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
		search_results = urllib.urlopen(url)
		try:
			payload = json.loads(search_results.read())
		finally:
			# Release the connection even when reading or decoding fails.
			search_results.close()
		data = payload.get('responseData')
		page = data.get('results') if data else None
		if not page:
			# Plain return ends a generator; an explicit StopIteration raised in
			# the body becomes a RuntimeError under PEP 479. An empty page also
			# stops here, otherwise the loop would keep requesting forever.
			return
		for h in page:
			yield h['url']


def similar(self, word, num):
	"""Return up to `num` words that share contexts with `word` in the text `self`.

	`self` is an object exposing `tokens`; a ContextIndex over its alphabetic,
	lower-cased tokens is built on first use and cached on the object as
	`_word_context_index`. The words are returned as one string produced by
	tokenwrap(). Returns None when `word` is not in the index.
	"""
	if '_word_context_index' not in self.__dict__:
		# First call for this text: build the index once and keep it on the object.
		self._word_context_index = ContextIndex(self.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower())
	target = word.lower()
	wci = self._word_context_index._word_to_contexts
	if target not in wci.conditions():
		return None
	shared_contexts = set(wci[target])
	# Count, for every other word, how many of its contexts the target also has.
	fd = FreqDist(
		other
		for other in wci.conditions()
		for context in wci[other]
		if context in shared_contexts and other != target
	)
	best = fd.keys()[:num]
	return tokenwrap(best)

def tumblrMachine(similars):
	"""Download images for the keyword and every related word, then render them.

	Downloads for the module-level `keyword` first, prints `similars`, then
	downloads for each entry of `similars` and calls displayImages().
	"""
	tumblrDownloader(keyword)
	print(similars)
	for related_word in similars:
		tumblrDownloader(related_word)
	displayImages(similars)

def getText(url):
	"""Fetch the text at `url` and look in it for words related to the keyword.

	The page is tokenized with nltk and similar() is asked for up to 20 words
	used in the same contexts as the module-level `keyword`. When more than
	four come back, those longer than three characters are ranked longest
	first and the top four are passed to tumblrMachine(). Otherwise, including
	when the keyword does not occur in the text at all, COUNTER is advanced
	and showless() moves on to the next search result.
	"""
	global COUNTER
	response = urlopen(url)
	try:
		raw = response.read()
	finally:
		# Do not leave the connection open while the text is being analysed.
		response.close()
	tokens = nltk.word_tokenize(raw)
	text = nltk.Text(tokens)
	words = similar(text, keyword, 20)
	# similar() returns None when the keyword is absent; treat that like
	# "too few matches" so the page is never left half-rendered.
	candidates = words.split() if words else []
	if len(candidates) > 4:
		similars = [w for w in candidates if len(w) > 3]
		# Stable ascending sort followed by reverse() keeps the same order
		# among equal-length words as sort(cmp=bylength) + reverse().
		similars.sort(key=len)
		similars.reverse()
		tumblrMachine(similars[0:4])
	else:
		COUNTER += 1
		showless()

def gutenbergClean(url):
	"""Rebuild a gutenberg.org/files/ address from a search hit and pass it to getText().

	The last path segment of `url` is taken; the part before its first dot is
	used both as the folder and as the file name. The rebuilt address ends in
	.htm when the segment ends in ".htm", and in .txt otherwise.
	"""
	basename = url.split('/')[-1]
	suffix = '.htm' if basename.endswith(".htm") else '.txt'
	stem = basename.split('.')[0]
	gutenbergUrl = 'http://www.gutenberg.org/files/'+stem+'/'+stem+suffix
	getText(gutenbergUrl)

def googleQuery(astr,num):
	"""Collect up to `num` search hits for `astr` and start processing them.

	Changes the working directory to the parent folder first. A few keywords
	are refused with a message instead of being searched. Otherwise the hit
	URLs are appended to the module-level `results` and showless() is called.
	"""
	os.chdir("..")
	if keyword == "berlusconi":
		print("please, do not enter swear words.")
		return
	if keyword in ("kill", "killer"):
		print("don't be silly")
		return
	results.extend(itertools.islice(hits(astr), num))
	showless()

def showless():
	"""Process the search hit at index COUNTER, or report that none is left.

	Reads the module-level `results` and `COUNTER`. When COUNTER points at an
	existing entry it is handed to gutenbergClean(); when the list is empty or
	COUNTER has moved past its end, "sorry, no matches" is printed instead of
	raising IndexError.
	"""
	# NOTE(review): COUNTER starts at 1, so results[0] is never tried -
	# confirm that skipping the first hit is intentional.
	if COUNTER < len(results):
		gutenbergClean(results[COUNTER])
	else:
		print("sorry, no matches")



# Script entry point: gather up to 50 search hits for the gutenberg.org-restricted
# query and start processing them.
googleQuery(searchdef,50)