Goodiff TOS word frequency

From XPUB & Lens-Based wiki
TOS selected words frequency on timeline

By Dusan and Natasa, at GooDiff.org workshop with Nicolas Maleve, Michael and Aymeric

The code returns the frequencies of selected words in the TOS document specified by the path variable, for each revision of that document. The result is in CSV format, which was then imported into Google Spreadsheets (time pressure: a quick & dirty solution).

Python code
from git.repository import Repository
 
# Handle on the git checkout under ./dataset that the script walks.
r = Repository("./dataset")

# Location of the document to analyse, relative to the repository root and
# split into its components so it can be walked one tree level at a time.
# Alternative documents that can be swapped in:
#   "google/gmail.google.com/mail/help/terms_of_use.html"
#   "ebay/www.skype.com/company/legal/privacy/privacy_general.html"
path = "facebook/www.facebook.com/terms.php".split("/")
 
def treewalk(tree, path):
    """Follow the components of *path* down from *tree*.

    Returns the object reached after consuming every component, *tree*
    itself when *path* is empty, or None as soon as a component is not
    contained in the current node.
    """
    node = tree
    for component in path:
        if component not in node:
            # Missing entry: the document does not exist at this revision.
            return None
        node = node[component]
    return node
 
import codecs
import nltk.util
import datetime
from operator import itemgetter
from nltk.corpus import stopwords

# Words whose frequencies are reported, in CSV column order.
TRACKED_WORDS = ["you", "we", "share", "privacy", "please", "friend", "control"]

# Characters removed from the end of a token: punctuation plus a crude
# plural "s" (so "friends" is counted as "friend").
# NOTE: only the very last character is considered, so "friends," keeps its
# "s" -- this quick & dirty rule is preserved from the original script.
TRAILING_CHARS = (",", ".", "!", "?", ";", "s", ")")

# French forms folded into their English equivalents. Lookup is by exact
# match. "vou" and "nou" are what "vous" and "nous" become after the
# trailing "s" has been stripped, so both spellings are listed.
TRANSLATIONS = {
    "priv&eacute;e": "privacy",
    "priv&eacute;": "privacy",
    "vous": "you",
    "vou": "you",
    "nous": "we",
    "nou": "we",
    "contr&ocirc;le": "control",
    "contr&ocirc;ler": "control",
    "confiance": "trust",
    "confier": "trust",
    "r&eacute;clame": "advertisement",
    "publicit&eacute;": "advertisement",
    "plait": "please",
}

# Only revisions committed before this moment are analysed; the cut-off is
# there to avoid French versions of the TOS.
CUTOFF = datetime.datetime(2009, 11, 26, 18, 10)

# Built once instead of once per revision. The tracked words are removed
# from the stopword set: the English list contains "you" and "we", which
# would otherwise be discarded before they could be counted.
english_stops = set(stopwords.words("english")) - set(TRACKED_WORDS)


def _normalise(token):
    """Return *token* stripped of its trailing character, lower-cased and translated."""
    last = token[-1:]
    if last in TRAILING_CHARS:
        token = token.rstrip(last)
    token = token.lower()
    return TRANSLATIONS.get(token, token)


def _word_frequencies(text):
    """Return a dict mapping each normalised non-stopword in *text* to its count."""
    frequencies = {}
    for token in text.split():
        if token in english_stops:
            continue
        word = _normalise(token)
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies


# CSV header; the rows below emit one count per tracked word in this order.
print("date;" + ";".join(TRACKED_WORDS))

seen = {}
count = 0
noname = None
# A separate loop variable keeps the repository handle ``r`` intact.
for commit in r.rev_list():
    blob = treewalk(commit.tree, path)
    # Skip revisions without the document, and contents already reported
    # (presumably blob.name identifies the content -- TODO confirm).
    if not blob or blob.name in seen:
        continue
    committed = datetime.datetime.fromtimestamp(float(commit._committer_timestamp))
    if not committed < CUTOFF:
        continue

    seen[blob.name] = True
    count += 1

    frequencies = _word_frequencies(nltk.util.clean_html(blob.contents))

    # Always emit every column (0 when a word is absent) so that the values
    # stay aligned with the header.
    row = [str(committed)]
    for word in TRACKED_WORDS:
        row.append(str(frequencies.get(word, 0)))
    print(";".join(row))
Result

Ideas

  • Distance/similarity between two documents? How does the Facebook privacy policy differ from the Google privacy policy? The word-frequency vector of each document can be used to compare the two. That would show whether there are major differences between the companies' approaches to privacy. It could even make a nice visualization.