User:Laura Macchini/project1: Difference between revisions

From XPUB & Lens-Based wiki
(Created page with "File:Naamloze presentatie.pdf Pecha Kucha presentation of my project")
 
No edit summary
 
(9 intermediate revisions by the same user not shown)
Line 1: Line 1:
[[File:Naamloze presentatie.pdf]]
Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way.
Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them.
You can browse the results by clicking on the images or by typing another keyword.


Pecha Kucha presentation of my project
===Video Documentation===
<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>
 
 
===Link to the Rebelhuis website===
[http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/index.html go!]
 
==Code Dump==
<source lang=python>
#!/usr/bin/python
import cgi, cgitb
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap
 
# --- CGI response preamble ---------------------------------------------------
# HTTP header, then the empty line that terminates the header section.
print("Content-type: text/html")
print("")

# The search term arrives in the 'query' form field.  getfirst() always gives
# a single string (or None when the field is absent), even if the field is
# repeated in the request.
form = cgi.FieldStorage()
keyword = form.getfirst('query')

# The keyword is echoed back into the page, so it is HTML-escaped first;
# otherwise markup typed into the search box would be injected verbatim.
print('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" />

<title> Tumblrsaurus </title>
</head>

<body>
<div id="wrapper">
<div id="header">
<div id="realdeal"> you looked for '''+cgi.escape(str(keyword), True)+''' <br/>
<br/>
<img src="../arrow.png" alt="arrow"/>
</div>
</div>
''')


# --- module-level state --------------------------------------------------------
# Index into `results` of the search hit that showless() tries next; getText()
# increments it when a hit yields too few related words.
COUNTER = 1

# Result URLs collected by googleQuery().
results = []
# Query string handed to googleQuery(): the keyword restricted to gutenberg.org.
searchdef = 'site:gutenberg.org '+str(keyword)
# Image extensions; iterated by tumblrDownloader().
extensions = ("jpg","png","bmp","gif")
 
def bylength(word1, word2):
    """Compare two words by length, cmp-style.

    Returns a negative number when word1 is shorter than word2, zero when
    both have the same length, and a positive number when word1 is longer.
    """
    first_len, second_len = len(word1), len(word2)
    return first_len - second_len
 
def displayKeywords():
    """Print the first slideshow: every image stored for the search keyword.

    Reads the module-level ``keyword`` and lists ``./images/<keyword>``.  One
    ``<img>`` tag per directory entry is printed inside
    ``<div id="slideshow1">``; only the first tag carries ``class="active"``.

    The keyword comes from the request and the file names from downloaded
    URLs, so both are HTML-escaped before being written into the ``src`` and
    ``alt`` attributes.
    """
    targetfolder = './images/'+keyword
    keyword_dir_list = os.listdir(targetfolder)
    safe_keyword = cgi.escape(keyword, True)
    print('''<div id="slideshow1">''')
    for position, url in enumerate(keyword_dir_list):
        safe_src = cgi.escape(targetfolder+"/"+url, True)
        # Only the first image starts out as the visible ("active") slide.
        closing = ' class="active"/>' if position == 0 else ' />'
        print('<img src="../'+safe_src+'" alt="'+safe_keyword+'"'+closing)
    print('''</div>''')
 
def displayImages(similars):
    """Print all slideshows and close the page.

    The keyword slideshow is printed first via displayKeywords().  Each word
    in ``similars`` then gets its own ``<div id="slideshowN">`` (N starting
    at 2) holding one ``<img>`` per file found in ``./images/<word>``; the
    first image of each slideshow is marked ``class="active"``.  Finally the
    closing ``</div> </body> </html>`` tags are printed.
    """
    displayKeywords()
    for slide_id, sim in enumerate(similars, 2):
        print('''<div id="slideshow''' + str(slide_id) + '''">''')
        folder = './images/' + sim
        pending_active = True
        for entry in os.listdir(folder):
            image_path = folder + "/" + entry
            if pending_active:
                pending_active = False
                print('<img src="../' + image_path + '" alt="' + sim + '" class="active"/>')
            else:
                print('<img src="../' + image_path + '" alt="' + sim + '" />')
        print(''' </div> ''')
    print('''</div> </body> </html>''')
 
 
def tumblrDownloader(zoekwoord):
    """Download the images of the Tumblr feed for ``zoekwoord``.

    The Yahoo Pipes RSS feed for the search word is parsed with feedparser.
    In every entry description, the token that follows one ending in ``img``
    has its first five and last characters stripped (presumably a
    ``src="..."`` attribute) and the remainder is used as the image URL.  The
    image is fetched with ``wget`` into ``images/<zoekwoord>`` (relative to
    the current working directory), set to mode 755 and resized to 250 with
    ``mogrify``.  Files already present are not downloaded again.

    The external tools are run through ``subprocess`` with argument lists
    rather than shell strings: the URLs come from the remote feed and must
    never be interpreted by a shell.
    """
    import subprocess  # local import: only this function needs it

    pipe = ("http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="
            + urllib.quote(zoekwoord))
    feed = feedparser.parse(pipe)

    # We need a folder to store the images (makedirs also creates "images").
    targetfolder = 'images/'+zoekwoord
    if not os.path.isdir(targetfolder):
        os.makedirs(targetfolder)

    for e in feed.entries:
        words = e.description.split()
        # Pair every token with its successor; zip stops one short of the
        # end, so a trailing "img" token cannot cause an IndexError.
        for word, following in zip(words, words[1:]):
            if not word.endswith("img"):
                continue
            image_url = following[5:-1]
            filename = image_url.split('/')[-1]
            if not filename:
                continue
            local_path = os.path.join(targetfolder, filename)
            if os.path.isfile(local_path):
                continue  # already downloaded
            try:
                # "--" keeps a URL starting with "-" from being read as an option.
                subprocess.call(['wget', '-q', '--', image_url], cwd=targetfolder)
                if os.path.isfile(local_path):
                    os.chmod(local_path, 0o755)
                    subprocess.call(['mogrify', '-resize', '250', local_path])
            except OSError:
                # Best effort, as with os.system before: a missing tool or a
                # failed chmod skips this image instead of aborting the page.
                continue
 
def hits(astr):
    """Yield result URLs from the Google AJAX web-search API for ``astr``.

    Result pages of eight entries are requested one after another (``start``
    advances by 8 per page).  Iteration ends when a response carries no
    ``responseData`` or when a page contains no results; without the second
    check an empty result list would make the generator request pages forever.
    """
    for page in itertools.count():
        query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': page*8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
        response = urllib.urlopen(url)
        try:
            payload = json.loads(response.read())
        finally:
            response.close()
        data = payload['responseData']
        page_hits = data.get('results') if data else None
        if not page_hits:
            # A plain return ends a generator; raising StopIteration by hand
            # is turned into RuntimeError by newer interpreters (PEP 479).
            return
        for h in page_hits:
            yield h['url']
 
 
def similar(self, word, num):
    """Return words that share contexts with ``word`` in the text ``self``.

    ``self`` is expected to provide ``tokens`` (it is called with an
    ``nltk.Text``).  A ``ContextIndex`` over the alphabetic, lower-cased
    tokens is built on first use and cached on the object as
    ``_word_context_index``.  Every other indexed word is counted once per
    context it shares with ``word``; the first ``num`` keys of the resulting
    ``FreqDist`` are returned as one ``tokenwrap``-formatted string.

    Returns None when ``word`` is not in the index.
    """
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word not in wci.conditions():
        return None
    contexts = set(wci[word])
    fd = FreqDist(
        candidate
        for candidate in wci.conditions()
        for context in wci[candidate]
        if context in contexts and candidate != word)
    return tokenwrap(fd.keys()[:num])
 
def tumblrMachine(similars):
    """Fetch images for the keyword and its related words, then show them.

    Downloads the images for the module-level ``keyword``, prints the
    ``similars`` list, downloads the images for every related word and
    finally prints all slideshows via displayImages().
    """
    tumblrDownloader(keyword)
    print(similars)
    for related_word in similars:
        tumblrDownloader(related_word)
    displayImages(similars)
 
def getText(url):
    """Derive related words from the text at ``url`` and show their images.

    The document is downloaded, tokenized with NLTK, and similar() is asked
    for up to 20 words sharing contexts with the module-level ``keyword``.
    When more than four words come back, those longer than three characters
    are ordered longest first and the top four are handed to tumblrMachine().

    Otherwise -- including when the keyword does not occur in the text at
    all, or the download fails -- ``COUNTER`` is advanced and the next search
    result is tried through showless().  Once no results are left, "sorry,
    no matches" is printed.
    """
    global COUNTER
    words = None
    try:
        raw = urlopen(url).read()
    except IOError:
        raw = None  # unreachable document: treat it like a text without matches
    if raw is not None:
        text = nltk.Text(nltk.word_tokenize(raw))
        words = similar(text, keyword, 20)

    candidates = words.split() if words else []
    if len(candidates) > 4:
        similars = [w for w in candidates if len(w) > 3]
        # Ascending stable sort followed by reverse(): same ordering as the
        # earlier sort(cmp=bylength) + reverse(), without the cmp argument.
        similars.sort(key=len)
        similars.reverse()
        tumblrMachine(similars[0:4])
    else:
        COUNTER += 1
        # Only recurse while there is another result to look at.
        if COUNTER < len(results):
            showless()
        else:
            print("sorry, no matches")
 
def gutenbergClean(url):
    """Rebuild a gutenberg.org file URL from a search hit and process it.

    The last path segment of ``url`` is cut at its first dot to obtain the
    book identifier, and the URL
    ``http://www.gutenberg.org/files/<id>/<id>.<ext>`` is passed to
    getText().  ``<ext>`` is ``htm`` when the segment ends in ``.htm`` and
    ``txt`` in every other case.
    """
    last_segment = url.split('/')[-1]
    suffix = '.htm' if last_segment.endswith(".htm") else '.txt'
    book_id = last_segment.split('.')[0]
    getText('http://www.gutenberg.org/files/' + book_id + '/' + book_id + suffix)
 
def googleQuery(astr,num):
    """Collect up to ``num`` search hits for ``astr`` and start processing.

    The working directory is first moved one level up.  A few keywords are
    refused with a message; for any other keyword the URLs yielded by hits()
    are appended to the module-level ``results`` list and showless() is
    called.
    """
    os.chdir("..")
    if keyword == "berlusconi":
        print("please, do not enter swear words.")
        return
    if keyword in ("kill", "killer"):
        print("don't be silly")
        return
    for found_url in itertools.islice(hits(astr), num):
        results.append(found_url)
    showless()
 
def showless():
    """Process the search result at index ``COUNTER``, if there is one.

    Passes ``results[COUNTER]`` to gutenbergClean().  When ``results`` is
    empty or ``COUNTER`` has moved past its last entry, "sorry, no matches"
    is printed instead of failing with an IndexError.
    """
    # NOTE(review): COUNTER starts at 1, so results[0] is never tried --
    # confirm whether skipping the first hit is intentional.
    if COUNTER < len(results):
        gutenbergClean(results[COUNTER])
    else:
        print("sorry, no matches")
 
 
 
# Script entry point: run the Gutenberg-restricted search for the keyword,
# collecting at most 50 result URLs.
googleQuery(searchdef,50)
 
 
</source>

Latest revision as of 11:31, 24 May 2011

Tumblrsaurus is a web application that lets you browse Tumblr in a semantic-associative way. Given a keyword, it looks up four related words (synonyms or words used in the same context) and displays the latest Tumblr image results for each of them. You can browse the results by clicking on the images or by typing another keyword.

Video Documentation

<video src="http://pzwart3.wdka.hro.nl/rebelhuis/student/laura/media/tm.ogv" controls="controls"></video>


Link to the Rebelhuis website

[go!]

Code Dump

#!/usr/bin/python
import cgi, cgitb 
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

# --- CGI response preamble ---------------------------------------------------
# HTTP header, then the empty line that terminates the header section.
print("Content-type: text/html")
print("")

# The search term arrives in the 'query' form field.  getfirst() always gives
# a single string (or None when the field is absent), even if the field is
# repeated in the request.
form = cgi.FieldStorage()
keyword = form.getfirst('query')

# The keyword is echoed back into the page, so it is HTML-escaped first;
# otherwise markup typed into the search box would be injected verbatim.
print('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> 
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script>
<script type="text/javascript" src="../slideshows.js"></script>
<link rel="stylesheet" href="../style.css" /> 

<title> Tumblrsaurus </title> 
</head>

<body>
<div id="wrapper">
<div id="header"> 
	<div id="realdeal"> you looked for '''+cgi.escape(str(keyword), True)+''' <br/>
	<br/>
	<img src="../arrow.png" alt="arrow"/>
	</div>
</div>
''')


# --- module-level state --------------------------------------------------------
# Index into `results` of the search hit that showless() tries next; getText()
# increments it when a hit yields too few related words.
COUNTER = 1

# Result URLs collected by googleQuery().
results = []
# Query string handed to googleQuery(): the keyword restricted to gutenberg.org.
searchdef = 'site:gutenberg.org '+str(keyword)
# Image extensions; iterated by tumblrDownloader().
extensions = ("jpg","png","bmp","gif")

def bylength(word1, word2):
    """Compare two words by length, cmp-style.

    Returns a negative number when word1 is shorter than word2, zero when
    both have the same length, and a positive number when word1 is longer.
    """
    first_len, second_len = len(word1), len(word2)
    return first_len - second_len

def displayKeywords():
    """Print the first slideshow: every image stored for the search keyword.

    Reads the module-level ``keyword`` and lists ``./images/<keyword>``.  One
    ``<img>`` tag per directory entry is printed inside
    ``<div id="slideshow1">``; only the first tag carries ``class="active"``.

    The keyword comes from the request and the file names from downloaded
    URLs, so both are HTML-escaped before being written into the ``src`` and
    ``alt`` attributes.
    """
    targetfolder = './images/'+keyword
    keyword_dir_list = os.listdir(targetfolder)
    safe_keyword = cgi.escape(keyword, True)
    print('''<div id="slideshow1">''')
    for position, url in enumerate(keyword_dir_list):
        safe_src = cgi.escape(targetfolder+"/"+url, True)
        # Only the first image starts out as the visible ("active") slide.
        closing = ' class="active"/>' if position == 0 else ' />'
        print('<img src="../'+safe_src+'" alt="'+safe_keyword+'"'+closing)
    print('''</div>''')

def displayImages(similars):
    """Print all slideshows and close the page.

    The keyword slideshow is printed first via displayKeywords().  Each word
    in ``similars`` then gets its own ``<div id="slideshowN">`` (N starting
    at 2) holding one ``<img>`` per file found in ``./images/<word>``; the
    first image of each slideshow is marked ``class="active"``.  Finally the
    closing ``</div> </body> </html>`` tags are printed.
    """
    displayKeywords()
    for slide_id, sim in enumerate(similars, 2):
        print('''<div id="slideshow''' + str(slide_id) + '''">''')
        folder = './images/' + sim
        pending_active = True
        for entry in os.listdir(folder):
            image_path = folder + "/" + entry
            if pending_active:
                pending_active = False
                print('<img src="../' + image_path + '" alt="' + sim + '" class="active"/>')
            else:
                print('<img src="../' + image_path + '" alt="' + sim + '" />')
        print(''' </div> ''')
    print('''</div> </body> </html>''')


def tumblrDownloader(zoekwoord):
    """Download the images of the Tumblr feed for ``zoekwoord``.

    The Yahoo Pipes RSS feed for the search word is parsed with feedparser.
    In every entry description, the token that follows one ending in ``img``
    has its first five and last characters stripped (presumably a
    ``src="..."`` attribute) and the remainder is used as the image URL.  The
    image is fetched with ``wget`` into ``images/<zoekwoord>`` (relative to
    the current working directory), set to mode 755 and resized to 250 with
    ``mogrify``.  Files already present are not downloaded again.

    The external tools are run through ``subprocess`` with argument lists
    rather than shell strings: the URLs come from the remote feed and must
    never be interpreted by a shell.
    """
    import subprocess  # local import: only this function needs it

    pipe = ("http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="
            + urllib.quote(zoekwoord))
    feed = feedparser.parse(pipe)

    # We need a folder to store the images (makedirs also creates "images").
    targetfolder = 'images/'+zoekwoord
    if not os.path.isdir(targetfolder):
        os.makedirs(targetfolder)

    for e in feed.entries:
        words = e.description.split()
        # Pair every token with its successor; zip stops one short of the
        # end, so a trailing "img" token cannot cause an IndexError.
        for word, following in zip(words, words[1:]):
            if not word.endswith("img"):
                continue
            image_url = following[5:-1]
            filename = image_url.split('/')[-1]
            if not filename:
                continue
            local_path = os.path.join(targetfolder, filename)
            if os.path.isfile(local_path):
                continue  # already downloaded
            try:
                # "--" keeps a URL starting with "-" from being read as an option.
                subprocess.call(['wget', '-q', '--', image_url], cwd=targetfolder)
                if os.path.isfile(local_path):
                    os.chmod(local_path, 0o755)
                    subprocess.call(['mogrify', '-resize', '250', local_path])
            except OSError:
                # Best effort, as with os.system before: a missing tool or a
                # failed chmod skips this image instead of aborting the page.
                continue

def hits(astr):
    """Yield result URLs from the Google AJAX web-search API for ``astr``.

    Result pages of eight entries are requested one after another (``start``
    advances by 8 per page).  Iteration ends when a response carries no
    ``responseData`` or when a page contains no results; without the second
    check an empty result list would make the generator request pages forever.
    """
    for page in itertools.count():
        query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': page*8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
        response = urllib.urlopen(url)
        try:
            payload = json.loads(response.read())
        finally:
            response.close()
        data = payload['responseData']
        page_hits = data.get('results') if data else None
        if not page_hits:
            # A plain return ends a generator; raising StopIteration by hand
            # is turned into RuntimeError by newer interpreters (PEP 479).
            return
        for h in page_hits:
            yield h['url']


def similar(self, word, num):
    """Return words that share contexts with ``word`` in the text ``self``.

    ``self`` is expected to provide ``tokens`` (it is called with an
    ``nltk.Text``).  A ``ContextIndex`` over the alphabetic, lower-cased
    tokens is built on first use and cached on the object as
    ``_word_context_index``.  Every other indexed word is counted once per
    context it shares with ``word``; the first ``num`` keys of the resulting
    ``FreqDist`` are returned as one ``tokenwrap``-formatted string.

    Returns None when ``word`` is not in the index.
    """
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word not in wci.conditions():
        return None
    contexts = set(wci[word])
    fd = FreqDist(
        candidate
        for candidate in wci.conditions()
        for context in wci[candidate]
        if context in contexts and candidate != word)
    return tokenwrap(fd.keys()[:num])

def tumblrMachine(similars):
    """Fetch images for the keyword and its related words, then show them.

    Downloads the images for the module-level ``keyword``, prints the
    ``similars`` list, downloads the images for every related word and
    finally prints all slideshows via displayImages().
    """
    tumblrDownloader(keyword)
    print(similars)
    for related_word in similars:
        tumblrDownloader(related_word)
    displayImages(similars)

def getText(url):
    """Derive related words from the text at ``url`` and show their images.

    The document is downloaded, tokenized with NLTK, and similar() is asked
    for up to 20 words sharing contexts with the module-level ``keyword``.
    When more than four words come back, those longer than three characters
    are ordered longest first and the top four are handed to tumblrMachine().

    Otherwise -- including when the keyword does not occur in the text at
    all, or the download fails -- ``COUNTER`` is advanced and the next search
    result is tried through showless().  Once no results are left, "sorry,
    no matches" is printed.
    """
    global COUNTER
    words = None
    try:
        raw = urlopen(url).read()
    except IOError:
        raw = None  # unreachable document: treat it like a text without matches
    if raw is not None:
        text = nltk.Text(nltk.word_tokenize(raw))
        words = similar(text, keyword, 20)

    candidates = words.split() if words else []
    if len(candidates) > 4:
        similars = [w for w in candidates if len(w) > 3]
        # Ascending stable sort followed by reverse(): same ordering as the
        # earlier sort(cmp=bylength) + reverse(), without the cmp argument.
        similars.sort(key=len)
        similars.reverse()
        tumblrMachine(similars[0:4])
    else:
        COUNTER += 1
        # Only recurse while there is another result to look at.
        if COUNTER < len(results):
            showless()
        else:
            print("sorry, no matches")

def gutenbergClean(url):
    """Rebuild a gutenberg.org file URL from a search hit and process it.

    The last path segment of ``url`` is cut at its first dot to obtain the
    book identifier, and the URL
    ``http://www.gutenberg.org/files/<id>/<id>.<ext>`` is passed to
    getText().  ``<ext>`` is ``htm`` when the segment ends in ``.htm`` and
    ``txt`` in every other case.
    """
    last_segment = url.split('/')[-1]
    suffix = '.htm' if last_segment.endswith(".htm") else '.txt'
    book_id = last_segment.split('.')[0]
    getText('http://www.gutenberg.org/files/' + book_id + '/' + book_id + suffix)

def googleQuery(astr,num):
    """Collect up to ``num`` search hits for ``astr`` and start processing.

    The working directory is first moved one level up.  A few keywords are
    refused with a message; for any other keyword the URLs yielded by hits()
    are appended to the module-level ``results`` list and showless() is
    called.
    """
    os.chdir("..")
    if keyword == "berlusconi":
        print("please, do not enter swear words.")
        return
    if keyword in ("kill", "killer"):
        print("don't be silly")
        return
    for found_url in itertools.islice(hits(astr), num):
        results.append(found_url)
    showless()

def showless():
    """Process the search result at index ``COUNTER``, if there is one.

    Passes ``results[COUNTER]`` to gutenbergClean().  When ``results`` is
    empty or ``COUNTER`` has moved past its last entry, "sorry, no matches"
    is printed instead of failing with an IndexError.
    """
    # NOTE(review): COUNTER starts at 1, so results[0] is never tried --
    # confirm whether skipping the first hit is intentional.
    if COUNTER < len(results):
        gutenbergClean(results[COUNTER])
    else:
        print("sorry, no matches")



# Script entry point: run the Gutenberg-restricted search for the keyword,
# collecting at most 50 result URLs.
googleQuery(searchdef,50)