Goodiff TOS word frequency

From XPUB & Lens-Based wiki
Revision as of 19:39, 17 March 2011 by Dusan (talk | contribs) (Created page with "; TOS selected words frequency on timeline By Dusan and Natasa, at Goodiff.org workshop with Nicolas Maleve, Michael and Aymeric Code returns the frequencies of selected word...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
TOS selected words frequency on timeline

By Dusan and Natasa, at Goodiff.org workshop with Nicolas Maleve, Michael and Aymeric

The code returns the frequencies of selected words in the TOS document specified by the path variable. The result is in CSV format, which was then imported into Google Spreadsheets (time pressure: quick & dirty solution).

Python code

from git.repository import Repository

# Repository opened from ./dataset (presumably the Goodiff archive of
# terms-of-service pages -- confirm against the dataset layout).
r = Repository("./dataset")

# Document to analyse, given as a slash-separated path inside the repository
# tree. Exactly one assignment is active; uncomment another to switch document.
# path = "google/gmail.google.com/mail/help/terms_of_use.html"
path = "facebook/www.facebook.com/terms.php"
# path = "ebay/www.skype.com/company/legal/privacy/privacy_general.html"

# treewalk() consumes the path one component at a time, so keep it as a list.
path = path.split("/")

def treewalk(tree, path):
    """Follow ``path`` down through nested containers starting at ``tree``.

    ``path`` is a sequence of keys. Each key is looked up in the node reached
    so far. An empty path returns ``tree`` itself. If any key is missing from
    the node it is looked up in, the result is ``None``.
    """
    if not path:
        return tree

    node = tree
    for name in path:
        if name not in node:
            # Missing component: same outcome as falling off the end of the
            # recursive version.
            return None
        node = node[name]
    return node

# codecs, itemgetter and stopwords are no longer used by the code below; the
# imports are kept in case other parts of the file rely on them.
import codecs
import nltk.util
import datetime
from operator import itemgetter
from nltk.corpus import stopwords

# Words whose per-revision counts are reported, in CSV column order.
TRACKED_WORDS = ["you", "we", "share", "privacy", "please", "friend", "control"]

# A word ending in one of these characters has every trailing repetition of
# that character removed. "s" is in the list, so plurals fold into the
# singular ("friends" -> "friend"); this also shortens other words ending in
# "s", which is accepted as part of the quick-and-dirty approach.
TRAILING_CHARS = [",", ".", "!", "?", ";", "s", ")"]

# French terms folded into their English counterparts before counting.
# NOTE(review): under Python 2 these accented literals need a source-encoding
# declaration (PEP 263) at the top of the file -- confirm the file has one.
FRENCH_TO_ENGLISH = {
    "privée": "privacy",
    "privé": "privacy",
    "vous": "you",
    "nous": "we",
    "contrôle": "control",
    "contrôler": "control",
    "confiance": "trust",
    "confier": "trust",
    "réclame": "advertisement",
    "publicité": "advertisement",
    "plait": "please",
}

# Revisions committed at or after this moment are skipped; the cutoff avoids
# the French versions of the TOS.
CUTOFF = datetime.datetime(2009, 11, 26, 18, 10)


def _normalize(word):
    """Return ``word`` stripped of its trailing character, lower-cased and translated."""
    last = word[-1:]
    if last in TRAILING_CHARS:
        word = word.rstrip(last)
    word = word.lower()
    # Exact-match lookup. The earlier `word in "vous"` style tests were
    # substring checks, so fragments such as "u", "ou", "i" or "a" were
    # silently counted as "you", "we" or "please".
    return FRENCH_TO_ENGLISH.get(word, word)


# CSV header: the date column followed by one column per tracked word.
print("date;" + ";".join(TRACKED_WORDS))

seen = {}      # names of document versions already reported
count = 0      # number of distinct versions reported so far
noname = None
# The loop variable has its own name so that it no longer shadows the
# repository object `r`.
for commit in r.rev_list():
    blob = treewalk(commit.tree, path)
    if not blob or blob.name in seen:
        # The document is absent from this revision, or this version of it
        # has already been reported.
        continue
    committed = datetime.datetime.fromtimestamp(float(commit._committer_timestamp))
    if committed >= CUTOFF:
        continue
    seen[blob.name] = True
    count += 1

    # NOTE(review): clean_html() raises NotImplementedError in NLTK 3.x --
    # confirm the installed NLTK version still provides it.
    text = nltk.util.clean_html(blob.contents)

    # Start every tracked word at zero so that each row has the same columns
    # as the header even when a word does not occur in this revision.
    # No stopword filtering is done: "you" and "we" are themselves English
    # stopwords, so filtering discarded the very words being measured.
    tallies = dict.fromkeys(TRACKED_WORDS, 0)
    for token in text.split(None):
        word = _normalize(token)
        if word in tallies:
            tallies[word] += 1

    # One CSV row per reported version, built in one piece so that no stray
    # spaces end up between the fields.
    row = [str(committed)] + [str(tallies[word]) for word in TRACKED_WORDS]
    print(";".join(row))

Result