User:Laura Macchini/project1: Difference between revisions
No edit summary |
|||
Line 1: | Line 1: | ||
==Code Dump== | ==Code Dump== | ||
#!/usr/bin/python | #!/usr/bin/python | ||
import cgi, cgitb | import cgi, cgitb | ||
Line 12: | Line 11: | ||
print "Content-type: text/html" | print "Content-type: text/html" | ||
print | print | ||
form = cgi.FieldStorage() | |||
keyword = form.getvalue('query') | |||
print ''' | print ''' | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> | ||
<head> | |||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> | |||
<script type="text/javascript" src="../jquery-1.2.6.min.js"></script> | |||
<script type="text/javascript" src="../slideshows.js"></script> | |||
<link rel="stylesheet" href="../style.css" /> | |||
<title> Tumblrsaurus </title> | |||
</head> | |||
<body> | |||
< | <div id="wrapper"> | ||
<div id="header"> | |||
</ | <div id="realdeal"> you looked for '''+str(keyword)+''' <br/> | ||
<br/> | |||
<img src="../arrow.png" alt="arrow"/> | |||
</div> | |||
</div> | |||
''' | |||
#global | #global | ||
Line 30: | Line 42: | ||
results = [] | results = [] | ||
searchdef = 'site:gutenberg.org '+keyword | searchdef = 'site:gutenberg.org '+str(keyword) | ||
extensions = ("jpg","png","bmp","gif") | extensions = ("jpg","png","bmp","gif") | ||
Line 37: | Line 49: | ||
def displayKeywords(): | def displayKeywords(): | ||
first = 1 | |||
targetfolder = './images/'+keyword | targetfolder = './images/'+keyword | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
print '''<div id="slideshow1">''' | |||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print " | targetfolderurl = targetfolder+"/"+url | ||
print '''</ | print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" class="active"/>' | ||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+keyword+'" />' | |||
print '''</div>''' | |||
def displayImages(similars): | def displayImages(similars): | ||
displayKeywords() | displayKeywords() | ||
counter2 = 2 | |||
for sim in similars: | for sim in similars: | ||
print '''<div id="slideshow'''+str(counter2)+'''">''' | |||
counter2 += 1 | |||
first = 1 | |||
targetfolder = './images/'+sim | targetfolder = './images/'+sim | ||
keyword_dir_list = os.listdir(targetfolder) | keyword_dir_list = os.listdir(targetfolder) | ||
for url in keyword_dir_list: | for url in keyword_dir_list: | ||
targetfolderurl = targetfolder+"/"+url | if first == 1: | ||
first = 0 | |||
print "< | targetfolderurl = targetfolder+"/"+url | ||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" class="active"/>' | |||
else: | |||
targetfolderurl = targetfolder+"/"+url | |||
print '<img src="../'+targetfolderurl+'" alt="'+sim+'" />' | |||
print ''' </div> ''' | |||
print '''</div> </body> </html>''' | |||
Line 83: | Line 107: | ||
cmd = 'wget -q %s' %pipe | cmd = 'wget -q %s' %pipe | ||
cmd2 = 'chmod 755 %s' %filename | cmd2 = 'chmod 755 %s' %filename | ||
cmd3 = 'mogrify -resize | cmd3 = 'mogrify -resize 250 %s' %filename | ||
os.system(cmd) | os.system(cmd) | ||
os.system(cmd2) | os.system(cmd2) | ||
Line 122: | Line 146: | ||
def tumblrMachine(similars): | def tumblrMachine(similars): | ||
tumblrDownloader(keyword) | tumblrDownloader(keyword) | ||
print similars | |||
for e in similars: | for e in similars: | ||
tumblrDownloader(e) | tumblrDownloader(e) | ||
Line 166: | Line 191: | ||
if keyword == "berlusconi": | if keyword == "berlusconi": | ||
print "please, do not enter swear words." | print "please, do not enter swear words." | ||
elif keyword == "kill" or keyword == "killer": | |||
print "don't be silly" | |||
else: | else: | ||
for i,h in enumerate(itertools.islice(hits(astr),num)): | for i,h in enumerate(itertools.islice(hits(astr),num)): | ||
Line 182: | Line 209: | ||
googleQuery(searchdef,50) | googleQuery(searchdef,50) | ||
</source> | </source> |
Revision as of 02:44, 17 December 2010
Code Dump
- !/usr/bin/python
import cgi, cgitb import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re from urllib import urlopen from nltk.text import * from nltk.probability import FreqDist from nltk.util import tokenwrap
print "Content-type: text/html" print
form = cgi.FieldStorage() keyword = form.getvalue('query')
print <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en"> <head> <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/> <script type="text/javascript" src="../jquery-1.2.6.min.js"></script> <script type="text/javascript" src="../slideshows.js"></script> <link rel="stylesheet" href="../style.css" />
<title> Tumblrsaurus </title> </head>
<body>
- global
COUNTER = 1
results = [] searchdef = 'site:gutenberg.org '+str(keyword) extensions = ("jpg","png","bmp","gif")
def bylength(word1, word2):
    """cmp-style comparator: orders strings by ascending length
    (negative when word1 is shorter, positive when longer, 0 on a tie)."""
    return len(word1) - len(word2)
def displayKeywords(): first = 1 targetfolder = './images/'+keyword keyword_dir_list = os.listdir(targetfolder)
printfor url in keyword_dir_list: if first == 1: first = 0 targetfolderurl = targetfolder+"/"+url print '' else: targetfolderurl = targetfolder+"/"+url print ''
printdef displayImages(similars): displayKeywords() counter2 = 2 for sim in similars:
printcounter2 += 1 first = 1 targetfolder = './images/'+sim keyword_dir_list = os.listdir(targetfolder) for url in keyword_dir_list: if first == 1: first = 0 targetfolderurl = targetfolder+"/"+url print '' else: targetfolderurl = targetfolder+"/"+url print ''
print</body> </html>
def tumblrDownloader(zoekwoord):
pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q="+zoekwoord
feed = feedparser.parse(pipe)
# We need a folder to store the images
targetfolder = 'images/'+zoekwoord
if not os.path.isdir(targetfolder):
os.mkdir(targetfolder)
#if the extension is null add jpg
for e in feed.entries:
words = e.description.split()
for i in range(len(words)):
if words[i].endswith("img"):
pipe = words[i+1][5:-1]
filename = pipe.split('/')[-1]
for ext in extensions:
os.chdir(targetfolder)
if os.path.isfile(filename):
os.chdir("../..")
- print "file exists, skip"
else:
- print "downloading " + filename
cmd = 'wget -q %s' %pipe cmd2 = 'chmod 755 %s' %filename cmd3 = 'mogrify -resize 250 %s' %filename os.system(cmd) os.system(cmd2) os.system(cmd3) os.chdir("../..") break
def hits(astr):
    """Generator yielding result URLs from the Google AJAX web-search API,
    paging 8 results at a time until the API returns no data."""
    for start in itertools.count():
        query = urllib.urlencode({'q':astr, 'rsz': 8, 'start': start*8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s'%(query)
        search_results = urllib.urlopen(url)
        results = json.loads(search_results.read())
        data = results['responseData']
        if data:
            hits = data['results']
            for h in hits:
                yield h['url']
        else:
            # PEP 479: `raise StopIteration` inside a generator is an error
            # on modern Python; a bare return ends the generator identically.
            return
def similar(self, word, num):
ret = []
if '_word_context_index' not in self.__dict__:
- print 'Building word-context index...'
self._word_context_index = ContextIndex(self.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower()) word = word.lower() wci = self._word_context_index._word_to_contexts if word in wci.conditions(): contexts = set(wci[word]) fd = FreqDist(w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word) words = fd.keys()[:num] return tokenwrap(words)
- else:
- print "No matches"
def tumblrMachine(similars): tumblrDownloader(keyword) print similars for e in similars: tumblrDownloader(e) displayImages(similars)
def getText(url): similars = [] global COUNTER filename = url.split('/')[-1]
- print "parsing " + url
raw = urlopen(url).read() tokens = nltk.word_tokenize(raw) text = nltk.Text(tokens) words = similar(text, keyword, 20) if words: ##!!!##############!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #if len(words.split()) < 3 counter+1 showless if len(words.split()) > 4: for w in words.split(): if len(w) > 3: similars.append(w) similars.sort(cmp=bylength) similars.reverse() similars = similars[0:4] # print similars tumblrMachine(similars) else: # print "no matches" COUNTER += 1 showless()
def gutenbergClean(url):
    """Map a gutenberg.org search-hit URL to its canonical
    http://www.gutenberg.org/files/<id>/<id>.<ext> URL and pass it on to
    getText. (The original's two branches differed only in the extension;
    collapsed to a single construction.)"""
    filename = url.split('/')[-1]
    # keep .htm when the hit was an HTML page, otherwise fetch plain text
    ext = '.htm' if filename.endswith(".htm") else '.txt'
    filename = filename.split('.')[0]
    gutenbergUrl = 'http://www.gutenberg.org/files/'+filename+'/'+filename+ext
    getText(gutenbergUrl)
def googleQuery(astr,num): os.chdir("..") if keyword == "berlusconi": print "please, do not enter swear words." elif keyword == "kill" or keyword == "killer": print "don't be silly" else: for i,h in enumerate(itertools.islice(hits(astr),num)): results.append(h) showless()
def showless(): global COUNTER if results: urlG = results[COUNTER] gutenbergClean(urlG) else: print "sorry, no matches"
# Entry point: search gutenberg.org for the user's keyword, up to 50 hits.
googleQuery(searchdef,50)
</source>