User:Bohye Woo/nltk-Terms of Service
Virtual Environment
1. To create a virtual environment
cd to the directory where you want to create it and run...
python3 -m venv venv
2. To activate a virtual environment
cd to the folder where "venv" is and run...
source venv/bin/activate
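With the virtual environment active, install NLTK into it with pip install nltk; the corpus data used in the examples below can then be fetched from inside Python (a minimal sketch, assuming the 'punkt' and 'stopwords' packages are all that these examples need):
import nltk
nltk.download('punkt')      # tokenizer model used by word_tokenize
nltk.download('stopwords')  # stopword lists used in the frequency example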
NLTK
Tokenize
>>> import nltk
>>> text = "If you choose to login to the Services via a third-party platform or social media network, you will need to use your credentials."
>>> token = nltk.word_tokenize(text)
>>> token
['If', 'you', 'choose', 'to', 'login', 'to', 'the', 'Services', 'via', 'a', 'third-party', 'platform', 'or', 'social', 'media', 'network', ',', 'you', 'will', 'need', 'to', 'use', 'your', 'credentials', '.']
sort
>>> token.sort()
>>> token
[',', '.', 'If', 'Services', 'a', 'choose', 'credentials', 'login', 'media', 'need', 'network', 'or', 'platform', 'social', 'the', 'third-party', 'to', 'to', 'to', 'use', 'via', 'will', 'you', 'you', 'your']
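list.sort() sorts in place and orders tokens by code point, which is why the punctuation and the capitalized 'If' and 'Services' come before the lowercase words. A case-insensitive ordering can be had with a key function (a small sketch, not part of the original session):
>>> sorted(token, key=str.lower)
[',', '.', 'a', 'choose', 'credentials', 'If', 'login', 'media', 'need', 'network', 'or', 'platform', 'Services', 'social', 'the', 'third-party', 'to', 'to', 'to', 'use', 'via', 'will', 'you', 'you', 'your']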
collections
>>> import collections
>>> collections.Counter(token)
Counter({'to': 3, 'you': 2, ',': 1, '.': 1, 'If': 1, 'Services': 1, 'a': 1, 'choose': 1, 'credentials': 1, 'login': 1, 'media': 1, 'need': 1, 'network': 1, 'or': 1, 'platform': 1, 'social': 1, 'the': 1, 'third-party': 1, 'use': 1, 'via': 1, 'will': 1, 'your': 1})
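Counter also has a most_common() method for pulling out just the top entries, which is what FreqDist does for the whole text further down (a small addition to the session above):
>>> collections.Counter(token).most_common(2)
[('to', 3), ('you', 2)]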
Concordance
import nltk
nltk.download('punkt')  # word_tokenize needs the punkt tokenizer data

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.concordance('services')
Result:
Displaying 11 of 11 matches:
t about FaceApp or our products or Services ( collectively , “ Feedback ” ) ,
e operation and maintenance of the Services and/or FaceApp ’ s business . - If
. - If you choose to login to the Services via a third-party platform or soci
connection with your account . Our Services may allow you and other users to c
nt that you post on or through the Services . You grant FaceApp a nonexclusive
ent solely to provide you with the Services . You acknowledge that some of the
. You acknowledge that some of the Services are supported by advertising reven
advertising and promotions on the Services or on , about , or in conjunction
at we may not always identify paid services , sponsored content , or commercia
modified by you on or through the Services in accordance with the rights and
tent you stylize on or through the Services ; and ( iii ) you have the legal r
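concordance() prints each occurrence of the word with its surrounding context. How much context and how many matches are shown can be tuned with the optional width and lines parameters of nltk.Text.concordance (a sketch of the same call with wider context and fewer lines):
faceapp.concordance('services', width=100, lines=5)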
Similar
import nltk

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.similar('services')
Result:
rights operation
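similar() lists words that occur in roughly the same contexts as the query word, so in this text 'rights' and 'operation' are distributionally close to 'services'. Which shared contexts drive this can be inspected with common_contexts (a sketch; the output depends on the corpus):
faceapp.common_contexts(['services', 'rights'])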
Output top 50 words
import codecs
import nltk
from nltk.corpus import stopwords

# NLTK's default English stopwords
default_stopwords = set(stopwords.words('english'))

# Read extra stopwords from a file (one stopword per line, UTF-8)
stopwords_file = './stopwords.txt'
custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords

file = open('faceapp.txt', 'r')
raw = file.read()
tokens = nltk.word_tokenize(raw)

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output top 50 words
for word, frequency in fdist.most_common(50):
    print(u'{};{}'.format(word, frequency))
Result:
content;16
user;14
services;13
faceapp;9
may;6
use;5
agreement;5
create;4
rights;4
account;4
feedback;3
without;3
grant;3
fully;3
paid;3
right;3
license;3
display;3
post;3
advertising;3
promotions;3
agree;3
materials;2
collectively;2
obligations;2
including;2
hereby;2
royalty-free;2
worldwide;2
reproduce;2
perform;2
distribute;2
adapt;2
modify;2
derivative;2
works;2
otherwise;2
manner;2
connection;2
third-party;2
platform;2
credentials;2
us;2
users;2
store;2
share;2
subject;2
acknowledge;2
reason;2
backup;2
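The same distribution can also be saved to a file or drawn as a plot; a minimal sketch of both (the output filename is an arbitrary choice, and FreqDist.plot needs matplotlib installed):
# Write the word;count pairs to a file instead of printing them
with open('faceapp_top50.txt', 'w') as out:
    for word, frequency in fdist.most_common(50):
        out.write('{};{}\n'.format(word, frequency))

# Plot the 50 most frequent tokens
fdist.plot(50)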