User:Emanuele Bonetti/ProblemSet2.3

From XPUB & Lens-Based wiki
< User:Emanuele Bonetti
Revision as of 20:34, 23 September 2010 by Migratebot (talk | contribs) (Created page with "Work Cloud - comment out the three different section to have three different work cloud <source lang="text"> import codecs import re from pprint import pprint import random...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Word Cloud - comment out the three different sections to get three different word clouds

import codecs
import re
from pprint import pprint
import random

# Load the whole story as unicode text. The with-block closes the file as soon
# as it has been read instead of leaving the handle open for the rest of the run.
with codecs.open("redcircle2.txt", "r", "utf-8") as story:
    t = story.read()

# A word is a run of letters, apostrophes or hyphens between word boundaries;
# re.I lets the a-z class match capital letters as well.
words = re.findall(r"\b[a-z'-]+\b", t, re.I)

# freq maps each distinct word, spelled exactly as it was matched, to the
# number of times it occurs in the text.
freq = {}
for w in words:
    freq[w] = freq.get(w, 0) + 1
	
# Emit the page <head>: a single stylesheet that upper-cases the body text and
# sets the large monospace font used by the page.
# (A single parenthesised argument prints the same on Python 2 and Python 3.)
print("""
\t<head>
<style type="text/css">
body {text-transform:uppercase;
font-size:56px;
font-family:"Courier New", Courier, monospace;
padding:20px;
}

</style>
</head>""")

# Emit the fixed title box (story title, author, link to the source text) in
# the top-right corner. The stacking declaration must be spelled "z-index:100";
# the former "z/index=100" is not valid CSS, so the title was never raised
# above the other fixed-position elements.
print("""<p style="z-index:100; color:red; position:fixed; top:20px;right:20px;font-size:20px; text-align:right;vertical-align:top; text-decoration:underline;"><strong>The Adventure <br />of the red circle</strong><br />Sir A. Conan Doyle<br /><br /><span style="font-size:12px;"><a href="http://www.gutenberg.org/files/2345/2345.txt">original source</a></span></p>""")


#------------------------------------------------------
# Section to print the table


# Frequency table: one row per repeated word with a bar of "I" characters,
# followed by a final row that lists every word occurring only once.
print("<table>")
print('<tr><td style="font-size:20px; text-align:right;vertical-align:top; text-decoration:underline;">Words appearing more than once<br /></td><td></td></tr>')

singlewords = []
for element, count in freq.items():
    if count <= 1:
        # Single occurrences get no bar; keep them for the closing row.
        singlewords.append(element)
        continue
    # One "I" per occurrence, so the bar length equals the frequency.
    bar = 'I' * count
    print("<tr>")
    print('<td style="font-size:12px;text-align:right;vertical-align:top;">' + element + ' |</td><td style="font-size:12px;vertical-align:top;color:red;">' + bar + '</td>')
    print("</tr>")

print('<tr><td style="font-size:12px;text-align:right;vertical-align:top; text-decoration:underline"><br />other words in the text</td><td style="font-size:12px;"><br />')
for word in singlewords:
    print(word + " |")
print("</td></tr>")
print("</table>")


#-------------------------------------------------------------------------------



# Highest occurrence count of any word; stays 0 when no words were found.
max_value = max(freq.values()) if freq else 0

# Font-size bounds (in px) used when the tag clouds scale words by frequency.
smallest_size = 8
biggest_size = 100

# (count, word) tuples ordered from the most to the least frequent word;
# words with equal counts come out in descending string order.
l = sorted(((n, w) for w, n in freq.items()), reverse=True)

# ll groups the words by frequency: ll[count] -> list of words with that count.
ll = {}
for n, w in l:
    ll.setdefault(n, []).append(w)

# The distinct counts, highest first; the tag clouds walk them in this order.
numbs = sorted(ll, reverse=True)

#----------------------------------------------------------------------------
#section to print the first tag cloud

#print numbs
# First tag cloud: one column of words per frequency, the most frequent words
# in the right-most column, each column roughly centred around y = 400px.
c = 900  # left offset (px) of the column currently being printed
i = 100  # horizontal distance (px) between neighbouring columns

# When no word repeats, max_value is 1 and max_value - 1 would be a zero
# divisor; fall back to 1 so the page can still be generated.
span = max(max_value - 1, 1)

for n in numbs:
    # Multiply before dividing: dividing first truncated the per-occurrence
    # step to an integer, which becomes 0 (every word at smallest_size) as
    # soon as max_value - 1 exceeds biggest_size - smallest_size.
    font_size = smallest_size + (biggest_size - smallest_size) * n // span
    column = sorted(ll[n])
    # Start half a column above the centre line so the column straddles it.
    h = 400 - font_size * (len(column) // 2)
    for w in column:
        print('<span style="font-size:' + str(font_size) + 'px; position:fixed; top:' + str(h) + 'px;left:' + str(c) + 'px;">' + w + '</span>')
        h = h + font_size
    # The next (less frequent) group goes one column further to the left.
    c = c - i
	

#---------------------------------------------------------------
#Section to print the second tag cloud


# Second tag cloud: every word is dropped at a random fixed position inside
# the rectangle (x0, y0)-(x1, y1), sized by how often it occurs.
x0 = 0
x1 = 900
y0 = 0
y1 = 600

# When no word repeats, max_value is 1 and max_value - 1 would be a zero
# divisor; fall back to 1 so the page can still be generated.
span = max(max_value - 1, 1)

for n in numbs:
    # Multiply before dividing: dividing first truncated the per-occurrence
    # step to an integer, which becomes 0 (every word at smallest_size) as
    # soon as max_value - 1 exceeds biggest_size - smallest_size.
    font_size = smallest_size + (biggest_size - smallest_size) * n // span
    for w in sorted(ll[n]):
        # Draw the vertical coordinate first, then the horizontal one.
        top = random.randint(y0, y1)
        left = random.randint(x0, x1)
        print('<span style="font-size:' + str(font_size) + 'px; position:fixed; top:' + str(top) + 'px;left:' + str(left) + 'px;">' + w + '</span>')
    # NOTE: a disabled experiment that moved x0/y0 outwards by 50px times a
    # per-group counter used to sit here as a no-op string literal; it never
    # ran, so it and its counter were dropped.

#---------------------------------------------------------------


Graph


import codecs
import re
from pprint import pprint
import random
import pygraphviz

# Load the whole story as unicode text. The with-block closes the file as soon
# as it has been read instead of leaving the handle open for the rest of the run.
with codecs.open("redcircle2.txt", "r", "utf-8") as story:
    t = story.read()

# A word is a run of letters, apostrophes or hyphens between word boundaries;
# re.I lets the a-z class match capital letters as well.
words = re.findall(r"\b[a-z'-]+\b", t, re.I)
#print words


# Every word paired with the word that immediately follows it, in text order.
# Zipping the list against itself shifted by one stops at the last word, which
# has no successor and therefore starts no pair.
pairs = list(zip(words, words[1:]))

# graph maps each word to the distinct words seen directly after it, kept in
# order of first appearance.
graph = {}
for word, follower in pairs:
    followers = graph.setdefault(word, [])
    # Check membership in the whole list. Comparing the list with [follower]
    # only detected a repeat while exactly one follower was stored, so
    # duplicates slipped in once a word had two or more followers.
    if follower not in followers:
        followers.append(follower)


# Build a directed graph with one red edge word -> follower for every
# succession recorded in `graph`.
g = pygraphviz.AGraph(directed=True)
for word, followers in graph.items():
    for follower in followers:
        g.add_edge(word, follower, color='red')

# Render the graph to graph0.png using the "circo" layout program.
g.draw('graph0.png', prog="circo")


Attachments