User:Laura Macchini/project1

[[File:Naamloze presentatie.pdf]] Pecha Kucha presentation of my project

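The page is driven by the CGI script below: it takes a query word, runs a Google search restricted to gutenberg.org, downloads a matching book text, asks NLTK for words that occur in the same contexts as the query, and then fills the page with Tumblr images (fetched through a Yahoo Pipes feed) for the query word and its four longest context-neighbours.
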
<code>
#!/usr/bin/python
import cgi, cgitb
import json, urllib, sys, os, nltk, itertools, feedparser, urllib2, time, re
from urllib import urlopen
from nltk.text import *
from nltk.probability import FreqDist
from nltk.util import tokenwrap

print "Content-type: text/html"
print
print '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">'''
print '''<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />
<title> tumblrthesaurus </title>
</head>'''
print '''<body>'''

form = cgi.FieldStorage()
keyword = form.getvalue('query')

# globals
COUNTER = 1
results = []
searchdef = 'site:gutenberg.org ' + keyword
extensions = ("jpg", "png", "bmp", "gif")

def bylength(word1, word2):
    return len(word1) - len(word2)
def displayKeywords():
    # show the images fetched for the query word itself
    targetfolder = './images/' + keyword
    keyword_dir_list = os.listdir(targetfolder)
    for url in keyword_dir_list:
        targetfolderurl = targetfolder + "/" + url
        print '<img src="../' + targetfolderurl + '" alt="' + keyword + '" />'
    print "<br/>"

def displayImages(similars):
    # the query word's images first, then one row per similar word
    displayKeywords()
    for sim in similars:
        print "**"
        targetfolder = './images/' + sim
        keyword_dir_list = os.listdir(targetfolder)
        for url in keyword_dir_list:
            targetfolderurl = targetfolder + "/" + url
            print '<img src="../' + targetfolderurl + '" alt="' + sim + '" />'
        print "<br/>"
    # close the document once, after every row has been printed
    print '''</body> </html>'''
def tumblrDownloader(zoekwoord):
    # fetch Tumblr posts matching the word through a Yahoo Pipes RSS feed
    pipe = "http://pipes.yahoo.com/pipes/pipe.run?_id=zIR_uOFQ3hGQCl8j6ycw5g&_render=rss&q=" + zoekwoord
    feed = feedparser.parse(pipe)
    # We need a folder to store the images
    targetfolder = 'images/' + zoekwoord
    if not os.path.isdir(targetfolder):
        os.mkdir(targetfolder)
    # TODO: if the extension is missing, add jpg (the unused `extensions`
    # tuple above was meant for this check)
    for e in feed.entries:
        words = e.description.split()
        for i in range(len(words)):
            if words[i].endswith("img"):
                # the token after "<img" is src="...": strip src=" and the closing quote
                pipe = words[i+1][5:-1]
                filename = pipe.split('/')[-1]
                os.chdir(targetfolder)
                if os.path.isfile(filename):
                    # file exists, skip
                    os.chdir("../..")
                else:
                    # download the image and make it readable
                    cmd = 'wget -q %s' % pipe
                    cmd2 = 'chmod 755 %s' % filename
                    os.system(cmd)
                    os.system(cmd2)
                    os.chdir("../..")
                break
def hits(astr):
    # page through the Google AJAX Search API, yielding result URLs
    for start in itertools.count():
        query = urllib.urlencode({'q': astr, 'rsz': 8, 'start': start*8})
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (query)
        search_results = urllib.urlopen(url)
        # keep the parsed response local so it does not shadow the global `results`
        response = json.loads(search_results.read())
        data = response['responseData']
        if data:
            for h in data['results']:
                yield h['url']
        else:
            return  # no more result pages
def similar(self, word, num):
    # adapted from nltk.Text.similar(): returns the similar words
    # instead of printing them
    if '_word_context_index' not in self.__dict__:
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]  # most frequent first
        return tokenwrap(words)
    # falls through to None when there are no matches
def tumblrMachine(similars):
    # download images for the query word and each similar word, then render
    tumblrDownloader(keyword)
    for e in similars:
        tumblrDownloader(e)
    displayImages(similars)

def getText(url):
    global COUNTER
    similars = []
    raw = urlopen(url).read()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    words = similar(text, keyword, 20)
    if words:
        for w in words.split():
            if len(w) > 3:
                similars.append(w)
        similars.sort(cmp=bylength)
        similars.reverse()
        similars = similars[0:4]  # keep the four longest words
        tumblrMachine(similars)
    else:
        # nothing similar in this text: try the next search result
        COUNTER += 1
        showless()
def gutenbergClean(url):
    # rewrite a gutenberg.org search hit into a direct link to the book file
    filename = url.split('/')[-1]
    if filename.endswith(".htm"):
        filename = filename.split('.')[0]
        gutenbergUrl = 'http://www.gutenberg.org/files/' + filename + '/' + filename + '.htm'
    else:
        filename = filename.split('.')[0]
        gutenbergUrl = 'http://www.gutenberg.org/files/' + filename + '/' + filename + '.txt'
    getText(gutenbergUrl)

def googleQuery(astr, num):
    # collect up to `num` search results, then process the first one
    os.chdir("..")
    for i, h in enumerate(itertools.islice(hits(astr), num)):
        results.append(h)
    showless()

def showless():
    # process the result that COUNTER currently points at
    global COUNTER
    urlG = results[COUNTER]
    gutenbergClean(urlG)

googleQuery(searchdef, 50)
</code>
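
The similar() helper above is nltk.Text.similar() with one change: it returns the similar words instead of printing them, so the script can reuse them as new search terms. The underlying behaviour can be seen with stock NLTK (assuming the corpus data has been fetched with nltk.download()):

<code>
import nltk
from nltk.corpus import gutenberg

# words that occur in the same contexts as "whale" in Moby Dick;
# nltk.Text.similar() prints them, which is why the script carries its own copy
text = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
text.similar('whale', 20)
</code>

Because everything runs as a CGI handler, the easiest way to try the script without a browser is to fake the CGI environment and execute it directly. A minimal sketch, assuming the listing is saved as tumblrthesaurus.py (a hypothetical filename) in the webserver's cgi-bin directory, with the images/ folder one level up as the os.chdir("..") in googleQuery() expects:

<code>
# hypothetical local test harness; run it from the cgi-bin directory
import os

# cgi.FieldStorage() reads the request from these environment variables,
# so this simulates a GET request for ?query=ocean
os.environ['REQUEST_METHOD'] = 'GET'
os.environ['QUERY_STRING'] = 'query=ocean'

execfile('tumblrthesaurus.py')  # the generated HTML lands on stdout
</code>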