16-03-2011 Laura Amy Laurier: Difference between revisions
Amy Suo Wu (talk | contribs) No edit summary |
No edit summary |
||
Line 19: | Line 19: | ||
english_stops = set(stopwords.words("english")) | english_stops = set(stopwords.words("english")) | ||
urls = [ | |||
'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService', | |||
'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService'] | |||
lists = '' | |||
for url in urls: | |||
t = urllib2.urlopen(url) | |||
c = t.read() | |||
c = lxml.html.clean.clean_html(c) | |||
c = nltk.util.clean_html(c) | |||
q = c.split() | |||
q = [w for w in q if w.lower() not in english_stops and w.isalpha()] | |||
fdist = FreqDist(q) | |||
voc = fdist.keys() | |||
for v in voc[:20]: | |||
lists = lists +' '+ v.lower() | |||
lists = lists + '\n\n' | |||
print lists | |||
</source> | </source> | ||
Revision as of 14:26, 16 March 2011
Simple Statistics
Legal terminology used in terms-and-conditions policies is often ambiguous and arbitrary. We want to highlight this ambiguity by showing the incongruity of definitions that depend on other factors which are often not explicitly explained. For example, the limits actually meant when the word 'unlimited' is used are paradoxical, and the word is often used to give a false sense of comfort/security/complacency.
Word frequency distribution
from nltk import FreqDist
import urllib2
import nltk.util
import lxml.html.clean
from nltk.corpus import stopwords
# Word-frequency report: for every page in `urls`, fetch the HTML, reduce it
# to plain text, and collect the 20 most frequent non-stopword words.
# The result is accumulated in `lists` (one line per page, each followed by a
# blank line) and printed at the end.

# Build the stopword set once so each membership test below is O(1).
english_stops = set(stopwords.words("english"))

# NOTE(review): both entries are the same URL, so the same word list is
# printed twice -- confirm whether a second document was intended here.
urls = [
    'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService',
    'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService']

report_lines = []
for url in urls:
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    # Two-stage clean-up: lxml sanitises the markup first, then NLTK's
    # clean_html reduces what is left to plain text.
    html = lxml.html.clean.clean_html(html)
    text = nltk.util.clean_html(html)

    # Lower-case BEFORE counting so that e.g. "Terms" and "terms" are a single
    # sample instead of two entries that print identically. Only purely
    # alphabetic tokens that are not stopwords are kept.
    q = [w.lower() for w in text.split()
         if w.isalpha() and w.lower() not in english_stops]
    fdist = FreqDist(q)

    # Relies on the NLTK 2.x behaviour where FreqDist.keys() is ordered by
    # decreasing frequency, so the slice below is the 20 most frequent words.
    voc = fdist.keys()

    # Same layout as before: every word is preceded by a single space and
    # each page's list is terminated by a blank line.
    report_lines.append(''.join(' ' + v for v in voc[:20]) + '\n\n')

lists = ''.join(report_lines)
# Parenthesised form prints identically under Python 2 for a single argument.
print(lists)
from nltk import FreqDist
from matplotlib import *
import urllib2
from nltk.corpus import stopwords
# Word-frequency demo on a single force-majeure clause: tokenise the text,
# drop stopwords and non-words, and print the 10 most frequent words.
# (A stray bare `n` that preceded this script was removed; it was an
# undefined-name expression that would stop the script before it did anything.)
import string

# Build the stopword set once so each membership test below is O(1).
english_stops = set(stopwords.words("english"))
t = "** 20.1 ** SITE shall not be responsible for any failure to perform due to unforeseen circumstances or to causes beyond our reasonable control, including but not limited to: acts of God, such as fire, flood, earthquakes, hurricanes, tropical storms or other natural disasters; war, riot, arson, embargoes, acts of civil or military authority, or terrorism; fiber cuts; strikes, or shortages in transportation, facilities, fuel, energy, labor or materials; failure of the telecommunications or information services infrastructure; hacking, SPAM, or any failure of a computer, server or software, including Y2K errors or omissions, for so long as such event continues to delay the SITE's performance. "

# Normalise each whitespace-separated token: trim surrounding punctuation
# ("control," -> "control", "**" -> "") and lower-case it, so the stopword
# test and the counts are case-insensitive.
words = [w.strip(string.punctuation).lower() for w in t.split()]
# Keep purely alphabetic, non-stopword tokens; this also discards the empty
# strings and numbering ("20.1") left over from the markup.
words = [w for w in words if w.isalpha() and w not in english_stops]
fdist = FreqDist(words)

# Relies on the NLTK 2.x behaviour where FreqDist.keys() is ordered by
# decreasing frequency, so the slice below is the 10 most frequent words.
voc = fdist.keys()
# Parenthesised form prints identically under Python 2 for a single argument.
print(voc[:10])