User:Laura Macchini/project1: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
No edit summary
Line 1: Line 1:
<code>
<source lang="python">
#!/usr/bin/python
#!/usr/bin/python
import cgi, cgitb  
import cgi, cgitb  
Line 170: Line 170:




</code>
</source>

Revision as of 21:14, 15 December 2010

#!/usr/bin/python
import cgi, cgitb 
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

# CGI response: the HTTP header line, then the blank line that ends the headers.
print("Content-type: text/html")
print("")

# Opening of the XHTML page (doctype, <html>, <head>, start of <body>).
# The closing </body></html> tags are printed later by the display code.
print(
	'\n'
	'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> \n'
	'<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> \n'
	'<head> \n'
	'\t<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" /> \n'
	'\t<title> tumblrthesaurus </title> \n'
	'</head> \n'
	'<body>'
)
form = cgi.FieldStorage()
# getfirst() always yields a single string: '' when the 'query' field is
# absent, the first value when the field is repeated.
keyword = form.getfirst('query', '').strip()

# The keyword becomes a folder name under images/, so anything that could
# leave that folder (path separators, leading dot) is rejected together
# with an empty query instead of failing later with a traceback.
if not keyword or '/' in keyword or '\\' in keyword or keyword.startswith('.'):
	print('<p>Please supply a search keyword.</p>')
	print('''</body>  </html>''')
	sys.exit(0)

# Index into `results` of the next search hit to try (module-level state).
COUNTER = 1

# URLs returned by the web search; filled in by googleQuery().
results = []
# Search expression: the keyword restricted to gutenberg.org.
searchdef = 'site:gutenberg.org '+keyword
extensions = ("jpg","png","bmp","gif")

def bylength(word1, word2):
	"""Old-style comparison function ordering two words by their length.

	Returns a negative number when word1 is shorter than word2, zero when
	both have the same length and a positive number when word1 is longer.
	"""
	first_len = len(word1)
	second_len = len(word2)
	return first_len - second_len

def _printImageFolder(name):
	"""Print an <img> tag followed by a line break for each file in ./images/<name>.

	name -- folder name below ./images; it is also used as the alt text.
	"""
	targetfolder = './images/'+name
	try:
		filenames = os.listdir(targetfolder)
	except OSError:
		# A term whose folder is missing or unreadable simply contributes
		# no images; the rest of the page is still rendered.
		return
	# Both values end up inside double-quoted HTML attributes, so quotes
	# and angle brackets are escaped.
	alt = cgi.escape(name, True)
	for filename in filenames:
		src = cgi.escape('../'+targetfolder+'/'+filename, True)
		print('<img src="'+src+'" alt="'+alt+'" />')
		print("<br/>")


def displayKeywords(close_document=True):
	"""Print the images stored for the global search keyword.

	close_document -- when true (the default) the closing </body></html>
	tags are printed afterwards; pass False when more output follows.
	"""
	_printImageFolder(keyword)
	if close_document:
		print('''</body>  </html>''')


def displayImages(similars):
	"""Print the images for the keyword and then for every word in similars.

	Each similar word's images are preceded by a "**" separator line. The
	document is closed exactly once, after the last image.

	similars -- iterable of words that each name a folder below ./images.
	"""
	displayKeywords(close_document=False)
	for sim in similars:
		print("**")
		_printImageFolder(sim)
	print('''</body>  </html>''')


def tumblrDownloader(zoekwoord):
	"""Download the images listed by the Yahoo Pipes feed for a search word.

	The feed entry descriptions are scanned for <img> tags; each referenced
	image is fetched with wget into images/<zoekwoord>/ (relative to the
	current working directory) unless a file of that name already exists.
	Downloaded files are given mode 755, as before.

	zoekwoord -- the search term; it is also used as the folder name.
	"""
	import subprocess  # local import: only this function runs an external program

	# URL-encode the term so spaces or '&' cannot corrupt the query string.
	pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+urllib.quote_plus(zoekwoord)
	feed = feedparser.parse(pipe)
	# We need a folder to store the images; makedirs also creates images/
	# itself when it does not exist yet.
	targetfolder = os.path.join('images', zoekwoord)
	if not os.path.isdir(targetfolder):
		os.makedirs(targetfolder)
	# NOTE: the old "if the extension is null add jpg" step was never
	# implemented (its loop over `extensions` ran once and did nothing),
	# so file names are still taken from the URL unchanged.
	for e in feed.entries:
		words = e.description.split()
		# Pair each token with its successor so that a trailing "<img"
		# without a following attribute cannot run past the end of the list.
		for word, following in zip(words, words[1:]):
			if not word.endswith("img"):
				continue
			# The token after "<img" is expected to look like src="URL":
			# drop the five leading characters and the closing quote.
			imageurl = following[5:-1]
			filename = imageurl.split('/')[-1]
			if filename in ('', '.', '..'):
				continue
			target = os.path.join(targetfolder, filename)
			if os.path.isfile(target):
				# file exists, skip
				continue
			# An argument list (no shell) keeps feed-supplied text from being
			# interpreted as shell syntax; "--" stops it being read as an option.
			subprocess.call(['wget', '-q', '-P', targetfolder, '--', imageurl])
			if os.path.isfile(target):
				os.chmod(target, 0o755)

def hits(astr):
	"""Generate result URLs from the Google AJAX web search service for astr.

	Result pages of eight entries are requested one after another and the
	'url' of every entry is yielded. The generator finishes when a response
	carries no 'responseData' or an empty result list.

	astr -- the search expression.
	"""
	for page in itertools.count():
		query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': page*8})
		url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
		response = urllib.urlopen(url)
		try:
			payload = json.loads(response.read())
		finally:
			# Release the connection even if the body is not valid JSON.
			response.close()
		data = payload['responseData']
		# A plain return ends the generator. An empty page is treated as the
		# end too, otherwise the loop would keep requesting pages forever.
		if not data or not data['results']:
			return
		for hit in data['results']:
			yield hit['url']


def similar(self, word, num):
	"""Return words that occur in the same contexts as word, or None.

	self -- a text object exposing `tokens`; a ContextIndex built from them
	        is cached on it as `_word_context_index` on first use.
	word -- the word to look up (compared in lower case).
	num  -- maximum number of words to return.

	When the word is known to the index, the first num keys of a FreqDist
	over the words sharing at least one context with it are passed through
	tokenwrap() and returned (presumably one whitespace-separated string --
	the caller splits it). When the word is unknown, None is returned.
	"""
	if '_word_context_index' not in vars(self):
		self._word_context_index = ContextIndex(
			self.tokens,
			filter=lambda x:x.isalpha(),
			key=lambda s:s.lower())
	target = word.lower()
	wci = self._word_context_index._word_to_contexts
	if target not in wci.conditions():
		# No matches
		return None
	contexts = set(wci[target])
	# One count per (other word, shared context) pair.
	fd = FreqDist(
		w
		for w in wci.conditions()
		for c in wci[w]
		if c in contexts and not w == target)
	return tokenwrap(fd.keys()[:num])

def tumblrMachine(similars):
	"""Download images for the global keyword and every similar word, then show them.

	similars -- iterable of words related to the keyword.
	"""
	# The keyword itself is always fetched first, followed by its relatives.
	for term in itertools.chain([keyword], similars):
		tumblrDownloader(term)
	displayImages(similars)

def getText(url):
	"""Fetch a text, look for words similar to the keyword and act on the result.

	The document at url is downloaded and tokenised with NLTK, and
	similar() is asked for up to 20 words related to the global keyword.
	Words longer than three characters are kept, the four longest are
	passed to tumblrMachine(). When similar() finds nothing, COUNTER is
	advanced and showless() moves on to the next search hit.

	url -- address of the text to analyse.
	"""
	global COUNTER
	response = urlopen(url)
	try:
		raw = response.read()
	finally:
		# Close the connection even when reading fails.
		response.close()
	text = nltk.Text(nltk.word_tokenize(raw))
	words = similar(text, keyword, 20)
	if not words:
		# no matches: try the next search result
		COUNTER += 1
		showless()
		return
	similars = [w for w in words.split() if len(w) > 3]
	# Stable ascending sort by length followed by a reversal: longest words
	# first, with exactly the ordering the former cmp-based sort produced.
	similars.sort(key=len)
	similars.reverse()
	tumblrMachine(similars[0:4])

def gutenbergClean(url):
	"""Turn a search-hit URL into a gutenberg.org file URL and process it.

	The part of the last path segment before its first dot is used both as
	the folder and as the file name below http://www.gutenberg.org/files/.
	The extension is .htm when the original segment ends in ".htm" and
	.txt otherwise. The resulting URL is handed to getText().

	url -- a URL returned by the web search.
	"""
	basename = url.split('/')[-1]
	suffix = '.htm' if basename.endswith(".htm") else '.txt'
	stem = basename.split('.')[0]
	getText('http://www.gutenberg.org/files/'+stem+'/'+stem+suffix)

def googleQuery(astr,num):
	"""Collect up to num search hits for astr and start processing them.

	The working directory is first moved one level up. The hit URLs are
	appended to the global `results` list and showless() is then called.

	astr -- the search expression.
	num  -- maximum number of hits to collect.
	"""
	os.chdir("..")
	results.extend(itertools.islice(hits(astr), num))
	showless()

def showless():
	"""Process the search hit that the global COUNTER currently points at.

	When COUNTER is past the end of `results` (no hits at all, or every
	hit already tried) a short notice is printed and the HTML document is
	closed instead of raising IndexError.
	"""
	# NOTE(review): COUNTER is initialised to 1 in this file, so results[0]
	# is never tried -- confirm that skipping the first hit is intended.
	if COUNTER >= len(results):
		print('<p>No matching texts found.</p>')
		print('''</body>  </html>''')
		return
	gutenbergClean(results[COUNTER])

# Script entry point: collect up to 50 hits for the gutenberg.org-restricted
# keyword search; googleQuery() then starts processing them via showless().
googleQuery(searchdef,50)