16-03-2011 Laura Amy Laurier

From XPUB & Lens-Based wiki

Revision as of 16:47, 16 March 2011

Simple Statistics

Legal terminology used in terms and conditions policies is often ambiguous and arbitrary. We want to highlight this ambiguity by showing the incongruity of definitions that depend on other factors which are often not explicitly explained. For example, the limits actually meant when the word 'unlimited' is used are paradoxical, and the word often serves to give a false sense of comfort/security/complacency.


[Image: Bat.JPG]


Word frequency distribution

from nltk import FreqDist
import urllib2
import nltk.util
import lxml.html.clean
from nltk.corpus import stopwords
from nltk.corpus import wordnet

english_stops = set(stopwords.words("english"))

urls = [
'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService',
'http://goodiff.org/browser/paypal/www.paypal.com/fr/cgi-bin/webscr%3Fcmd%3Dp/gen/ua/policy_privacy-outside']


#for each URL to scrape
for url in urls:
	print '**************************************************** SCRAPING URL'
	t = urllib2.urlopen(url)

	c = t.read()
	# clean the HTML twice: lxml sanitises the markup, nltk strips the remaining tags
	c = lxml.html.clean.clean_html(c)
	c = nltk.util.clean_html(c)

	q = c.split()
	q = [w for w in q if w.lower() not in english_stops and w.isalpha()]
	fdist = FreqDist(q)
	#get the most used word in the TOS (in NLTK 2, fdist.keys() is sorted by decreasing frequency)
	voc = fdist.keys()
	print 'most used word in this TOS : '+voc[0]
	print

	#from all possible meanings of this word, grab the first one
	meanings = wordnet.synsets(voc[0])
	s = meanings[0]
	print 'first meaning of the most popular word in this TOS : '+str(s)+' ('+s.definition+')'
	print

	#for that meaning, get all lemmas (all ways to write this meaning)
	var = [[x.name for x in s.lemmas] for s in wordnet.synsets(voc[0])]
	for s in var:
		for w in s:
			print 'lemma : '+str(w)
		print
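The script above is Python 2 with the NLTK 2 API: urllib2, nltk.util.clean_html (removed in NLTK 3), frequency-ordered fdist.keys(), and attribute-style s.definition and x.name. A minimal Python 3 / NLTK 3 port might look like the sketch below; BeautifulSoup is assumed here as a stand-in for the removed clean_html helper, and the variable names are mine.

import urllib.request

from bs4 import BeautifulSoup  # assumed replacement for nltk.util.clean_html
from nltk import FreqDist
from nltk.corpus import stopwords, wordnet

english_stops = set(stopwords.words("english"))

urls = [
	'http://goodiff.org/browser/europoker/www.europoker.net/en/Text/TermsOfService',
	'http://goodiff.org/browser/paypal/www.paypal.com/fr/cgi-bin/webscr%3Fcmd%3Dp/gen/ua/policy_privacy-outside']

for url in urls:
	print('**************************************************** SCRAPING URL')
	html = urllib.request.urlopen(url).read()
	text = BeautifulSoup(html, 'html.parser').get_text()

	# drop stopwords and anything that is not purely alphabetic
	words = [w for w in text.split() if w.lower() not in english_stops and w.isalpha()]
	fdist = FreqDist(words)

	# in NLTK 3, most_common() replaces the frequency-ordered keys()
	top_word = fdist.most_common(1)[0][0]
	print('most used word in this TOS : ' + top_word)

	meanings = wordnet.synsets(top_word)
	if meanings:
		# definition() and name() are methods in NLTK 3, not attributes
		print('first meaning of the most popular word in this TOS : %s (%s)' % (meanings[0], meanings[0].definition()))
		for synset in meanings:
			for lemma in synset.lemmas():
				print('lemma : ' + lemma.name())
			print()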

Frequency distribution of a single clause

from nltk import FreqDist
from matplotlib import *
import urllib2
from nltk.corpus import stopwords
english_stops = set(stopwords.words("english"))

 
t = "** 20.1 ** SITE shall not be responsible for any failure to perform due to unforeseen circumstances or to causes beyond our reasonable control, including but not limited to: acts of God, such as fire, flood, earthquakes, hurricanes, tropical storms or other natural disasters; war, riot, arson, embargoes, acts of civil or military authority, or terrorism; fiber cuts; strikes, or shortages in transportation, facilities, fuel, energy, labor or materials; failure of the telecommunications or information services infrastructure; hacking, SPAM, or any failure of a computer, server or software, including Y2K errors or omissions, for so long as such event continues to delay the SITE's performance.&nbsp; &nbsp;"
 
words = t.split()
words = [w for w in words if w.lower() not in english_stops]
fdist = FreqDist(words)
 
voc = fdist.keys() 
 
print voc[:10]
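Here again, voc[:10] gives the ten most frequent words only because NLTK 2 returns fdist.keys() sorted by decreasing frequency; current NLTK versions no longer guarantee that order. Under NLTK 3 the same top-ten list would be:

print([w for w, count in fdist.most_common(10)])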


Search Results

Word strings ordered by frequency of occurrence:
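Each string below appears to be the twenty most frequent remaining words of one page, lowercased and joined with spaces. A minimal sketch of how such a string can be produced, to be placed inside the per-URL loop of the scraping script above (the exact output format is an assumption):

lists = ''
for v in voc[:20]:
	lists = lists + ' ' + v.lower()
lists = lists + '\n\n'
print lists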


site may terms use service agree materials information shall without third access services contained including right name party using otherwise

google information privacy may personal policy services use access provide cookies data process sites associated including security information application certain

information google privacy products center web privacy use chrome faq goodiff help provide advertising history home policy revision service toolkit

information information personal postrank may use site privacy us policy cookies collect contact data privacy provide third ip browser identifiable

postrank use otherwise terms service site available content may agree without information damages postrank site services access content make post