User:Bohye Woo/nltk-Terms of Service: Difference between revisions

Revision as of 16:51, 23 March 2020

Virtual Environment

1. To create a virtual environment

cd to the place you want to make it and...

    python3 -m venv venv

2. To activate a virtual environment

cd to the folder where "venv" is and...

    source venv/bin/activate

3. NLTK

Tokenize

>>> import nltk
>>> text = "If you choose to login to the Services via a third-party platform or social media network, you will need to use your credentials."
>>> token = nltk.word_tokenize(text)
>>> token
['If', 'you', 'choose', 'to', 'login', 'to', 'the', 'Services', 'via', 'a', 'third-party', 'platform', 'or', 'social', 'media', 'network', ',', 'you', 'will', 'need', 'to', 'use', 'your', 'credentials', '.']

sort

>>> token.sort()
>>> token
[',', '.', 'If', 'Services', 'a', 'choose', 'credentials', 'login', 'media', 'need', 'network', 'or', 'platform', 'social', 'the', 'third-party', 'to', 'to', 'to', 'use', 'via', 'will', 'you', 'you', 'your']

collections

>>> import collections
>>> collections.Counter(token)
Counter({'to': 3, 'you': 2, ',': 1, '.': 1, 'If': 1, 'Services': 1, 'a': 1, 'choose': 1, 'credentials': 1, 'login': 1, 'media': 1, 'need': 1, 'network': 1, 'or': 1, 'platform': 1, 'social': 1, 'the': 1, 'third-party': 1, 'use': 1, 'via': 1, 'will': 1, 'your': 1})

Concordance

# Fetch NLTK's stopword corpus.
# NOTE(review): word_tokenize() below relies on the "punkt" tokenizer models,
# which this call does not download -- confirm they are already installed.
nltk.download("stopwords")

# Read the whole Terms of Service text. The context manager closes the file
# handle as soon as the contents are in memory.
# Assumes the file is UTF-8 (the sample output contains typographic quotes).
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()

# Split the text into word/punctuation tokens and wrap them in an nltk.Text,
# which provides the exploration helpers used below.
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# Print every occurrence of "services" with its surrounding context.
# Matching ignores case: the sample output lists both "Services" and "services".
faceapp.concordance('services')

//////result///////
Displaying 11 of 11 matches:
t about FaceApp or our products or Services ( collectively , “ Feedback ” ) , 
e operation and maintenance of the Services and/or FaceApp ’ s business . - If
 . - If you choose to login to the Services via a third-party platform or soci
connection with your account . Our Services may allow you and other users to c
nt that you post on or through the Services . You grant FaceApp a nonexclusive
ent solely to provide you with the Services . You acknowledge that some of the
. You acknowledge that some of the Services are supported by advertising reven
 advertising and promotions on the Services or on , about , or in conjunction 
at we may not always identify paid services , sponsored content , or commercia
 modified by you on or through the Services in accordance with the rights and 
tent you stylize on or through the Services ; and ( iii ) you have the legal r

Similar

# Fetch NLTK's stopword corpus.
# NOTE(review): word_tokenize() below relies on the "punkt" tokenizer models,
# which this call does not download -- confirm they are already installed.
nltk.download("stopwords")

# Read the whole Terms of Service text. The context manager closes the file
# handle as soon as the contents are in memory.
# Assumes the file is UTF-8 (the text contains typographic quotes).
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()

# Split the text into word/punctuation tokens and wrap them in an nltk.Text,
# which provides the exploration helpers used below.
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# List words that appear in the same contexts as "services".
# This section demonstrates similar(), not concordance(): the recorded result
# ("rights operation") is a list of similar words, not a concordance table.
faceapp.similar('services')

//////result///////
rights operation

Output top 50 words

import sys
import codecs
import nltk
from nltk.corpus import stopwords

# NLTK's default English stopwords
default_stopwords = set(nltk.corpus.stopwords.words('english'))

# Read extra stopwords from a file (one stopword per line, UTF-8).
# The path is defined once and reused, and the handle is closed on exit.
stopwords_file = './stopwords.txt'
with codecs.open(stopwords_file, 'r', 'utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())

# Union of both lists; a set keeps the membership test below cheap.
all_stopwords = default_stopwords | custom_stopwords

# Read the whole Terms of Service text and tokenize it.
# Assumes the file is UTF-8 (the text contains typographic quotes).
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output top 50 words as "word;count", most frequent first.
# The count passed to most_common() must match the "top 50" promised here;
# the recorded result below lists 50 entries.
for word, frequency in fdist.most_common(50):
    print(u'{};{}'.format(word, frequency))

//////result///////
content;16
user;14
services;13
faceapp;9
may;6
use;5
agreement;5
create;4
rights;4
account;4
feedback;3
without;3
grant;3
fully;3
paid;3
right;3
license;3
display;3
post;3
advertising;3
promotions;3
agree;3
materials;2
collectively;2
obligations;2
including;2
hereby;2
royalty-free;2
worldwide;2
reproduce;2
perform;2
distribute;2
adapt;2
modify;2
derivative;2
works;2
otherwise;2
manner;2
connection;2
third-party;2
platform;2
credentials;2
us;2
users;2
store;2
share;2
subject;2
acknowledge;2
reason;2
backup;2

@@ Line 12: / Line 12: @@
 </source>
-===NLTK===
+===3. NLTK===
 Tokenize
 <source lang="python">
@@ Line 124: / Line 124: @@
 </source>
 <source lang="python">
 //////resrult///////