|
|
Line 44: |
Line 44: |
| [[File:Screen Shot 2018-03-24 at 16.12.30.png|thumb|NLTK Analysis outcome]] | | [[File:Screen Shot 2018-03-24 at 16.12.30.png|thumb|NLTK Analysis outcome]] |
|
| |
|
| | | First NLTK Analysis in python3 |
| <code>import nltk
| |
| | |
| from nltk import word_tokenize
| |
| | |
| from nltk import FreqDist
| |
| | |
| from nltk.tokenize import sent_tokenize
| |
| | |
| from sys import stdin,stdout
| |
| | |
| import re
| |
| | |
| import sys, string
| |
| | |
| #importing nltk library word_tokenize
| |
| | |
| from collections import Counter
| |
| | |
# Read the whole input text into memory. The context manager closes the file
# handle as soon as reading is done (the original never closed it).
with open("readertest.txt") as text:
    content = text.read()

#print(content)
| |
| | |
# Split on single space characters only: newlines and tabs stay attached to
# neighbouring words, and consecutive spaces produce empty strings.
words = content.split(" ")

# Distinct words in sorted order.
splitting_statistic = list(set(words))
splitting_statistic.sort()
#print(splitting_statistic)

# Human-readable total number of space-separated items.
wordsamount_statistic = f'{len(words)} Amount of the words'
print(wordsamount_statistic)
| |
| | |
| | |
# NOTE(review): this rebinds `string`, shadowing the `string` module imported
# at the top of the file.
string = content

# Count lowercase and uppercase characters; anything uncased (digits,
# punctuation, whitespace) is ignored.
count1 = sum(1 for ch in string if ch.islower())
count2 = sum(1 for ch in string if ch.isupper())

print("The number of lowercase characters is:")
print(count1)
print("The number of uppercase characters is:")
print(count2)
| |
| | |
| | |
# Print the ten most frequent items, first over the raw text (iterating a str
# yields single characters), then over the space-split word list.
# `fdist` is left bound to the word distribution, as before.
for label, sample in (("characters", content), ("words", words)):
    fdist = FreqDist(sample)
    print(f"10 most common {label}:")
    print(fdist.most_common(10))
| |
| | |
| | |
| #new_list = fdist.most_common()
| |
| | |
| #print(new_list)
| |
| | |
| | |
| #for word, _ in new_list: #_ ignores the second variable, dictionary (key, value)
| |
| #print(' ',_)
| |
|
| |
| #prints a list of the most common words - how to make it better in one line
| |
| | |
| | |
| | |
def vowel_or_consonants(c):
    """Classify a single character as a vowel, a consonant, or neither.

    Args:
        c: The string to classify; expected to be exactly one character.

    Returns:
        'Vowel' for a, e, i, o, u (case-insensitive), 'Consonant' for any
        other alphabetic character, and 'Neither' for non-alphabetic input
        or for input that is not exactly one character long.
    """
    # Reject empty and multi-character input up front: substring membership
    # on a plain string would otherwise report e.g. "ou" as a vowel.
    if len(c) != 1 or not c.isalpha():
        return 'Neither'

    vowels = frozenset('aeiou')
    return 'Vowel' if c.lower() in vowels else 'Consonant'
| |
| | |
| #for c in (content):
| |
| | |
| #print(c, vowel_or_consonants(c))
| |
|
| |
| | |
| #print(sent_tokenize(content))
| |
| | |
| #splitting text into sentences
| |
| | |
| | |
| #for word in (words):
| |
| #print(word)
| |
| | |
| #control structure: prints each word on a separate line
| |
| | |
| | |
| #fdist = FreqDist(words)
| |
| | |
| #print("hapaxes:")
| |
| #print(fdist.hapaxes())
| |
| | |
| #words that occur once only, the so-called hapaxes
| |
| | |
| | |
# Distinct words of the text.
V = set(words)

# Keep only the words longer than 15 characters.
long_words = list(filter(lambda token: len(token) > 15, V))

print("printing the more than 15 character long words of the text")
print(sorted(long_words))
| |
| | |
| | |
# Tokenise the text with NLTK and tag every token with its part of speech.
# `statistic3` is a sequence of (token, tag) pairs used by all tag counts.
tokenized_content = word_tokenize(content)
statistic3 = nltk.pos_tag(tokenized_content)
#print(statistic3)

# Collect every token carrying one of the verb tags, in text order.
verblist = [
    token
    for token, pos in statistic3
    if pos in {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
]
verbscounter = len(verblist)

# Report the number of verbs, then the verbs themselves.
verb_statistic = f'{verbscounter} Verbs'
print(verb_statistic)
print(verblist)
| |
| | |
# Collect every token carrying one of the noun tags, in text order.
nounlist = [
    token
    for token, pos in statistic3
    if pos in {'NNP', 'NNS', 'NN', 'NNPS'}
]
nouncounter = len(nounlist)

# Report the number of nouns, then the nouns themselves.
nouns_statistic = f'{nouncounter} Nouns'
print(nouns_statistic)
print(nounlist)
| |
| | |
| | |
# Aliases (not copies) of the verb and noun lists.
verblist2 = verblist
nounlist2 = nounlist

# Pair the i-th verb with the i-th noun. zip() stops at the shorter list, so
# this no longer raises IndexError when the text has more verbs than nouns
# (the original index loop did, and its result was overwritten anyway).
# A verb that occurs several times keeps the noun of its last occurrence.
verb_noun_dictionary = dict(zip(verblist2, nounlist2))
verblist_and_nounlists = dict(verb_noun_dictionary)

print(verblist_and_nounlists)
print(len(verblist))

# A one-element list whose only item is the full word list.
characters = [words]
#print(words)
| |
| '''from itertools import groupby
| |
| | |
| def n_letter_dictionary(string):
| |
| result = {}
| |
| for key, group in groupby(sorted(string.split(), key = lambda x: len(x)), lambda x: len(x)):
| |
| result[key] = list(group)
| |
| return result
| |
| | |
| print(n_letter_dictionary)'''
| |
| | |
| | |
# Collect every token carrying one of the adverb tags, in text order.
adverblist = [
    token
    for token, pos in statistic3
    if pos in {'RB', 'RBR', 'RBS', 'WRB'}
]
adverbscounter = len(adverblist)

# Report the number of adverbs, then the adverbs themselves.
adverb_statistic = f'{adverbscounter} Adverbs'
print(adverb_statistic)
print(adverblist)
| |
| | |
| | |
# Collect every token tagged as a personal or possessive pronoun.
pronounslist = [
    token
    for token, pos in statistic3
    if pos in {'PRP', 'PRP$'}
]
pronounscounter = len(pronounslist)

# Report the number of pronouns, then the pronouns themselves.
pronoun_statistic = f'{pronounscounter} Pronouns'
print(pronoun_statistic)
print(pronounslist)
| |
# Collect every token carrying one of the adjective tags, in text order.
adjectivslist = [
    token
    for token, pos in statistic3
    if pos in {'JJ', 'JJR', 'JJS'}
]
adjectivscounter = len(adjectivslist)

# Report the number of adjectives, then the adjectives themselves.
adjectiv_statistic = f'{adjectivscounter} Adjectives'
print(adjectiv_statistic)
print(adjectivslist)
| |
| | |
# Count the tokens tagged 'CC' (coordinating conjunction) and report the total.
coordinating_conjuction_counter = sum(
    1 for _, pos in statistic3 if pos == 'CC'
)
coordinating_conjuction_statistic = f'{coordinating_conjuction_counter} Coordinating conjuctions'
print(coordinating_conjuction_statistic)
| |
| | |
| | |
# Count the tokens tagged 'CD' (cardinal number) and report the total.
# The original tested 'CC', so it repeated the coordinating-conjunction count.
cardinal_number = sum(1 for _, pos in statistic3 if pos == 'CD')
cardinal_number_statistic = f'{cardinal_number} Cardinal numbers'
print(cardinal_number_statistic)
| |
| | |
| | |
# Count the tokens tagged 'DT' (determiner) and report the total.
# The original tested 'D', which is not a Penn Treebank tag, so the count
# was always 0.
determiner_counter = sum(1 for _, pos in statistic3 if pos == 'DT')
determiner_statistic = f'{determiner_counter} Determiners'
print(determiner_statistic)
| |
| | |
| | |
# Count the tokens tagged 'EX' (existential "there") and report the total.
existential_there_counter = sum(1 for _, pos in statistic3 if pos == 'EX')
existential_there_statistic = f'{existential_there_counter} Existential there'
print(existential_there_statistic)
| |
| | |
# Count the tokens tagged 'FW' (foreign word) and report the total.
foreing_words_counter = sum(1 for _, pos in statistic3 if pos == 'FW')
foreing_words_statistic = f'{foreing_words_counter} Foreing words'
print(foreing_words_statistic)
| |
# Collect every token tagged 'IN' (preposition or subordinating conjunction).
preposition_or_subordinating_conjunctionlist = [
    token for token, pos in statistic3 if pos == 'IN'
]
preposition_or_subordinating_conjunction_counter = len(
    preposition_or_subordinating_conjunctionlist
)

# Report the number of matches, then the matching tokens.
preposition_or_subordinating_conjunction_statistic = f'{preposition_or_subordinating_conjunction_counter} Preposition or subordinating conjunctions'
print(preposition_or_subordinating_conjunction_statistic)
print(preposition_or_subordinating_conjunctionlist)
| |
| | |
# Count the tokens tagged 'LS' (list item marker) and report the total.
list_item_marker_counter = sum(1 for _, pos in statistic3 if pos == 'LS')
list_item_marker_statistic = f'{list_item_marker_counter} List item markers'
print(list_item_marker_statistic)
| |
| | |
| | |
# Count the tokens tagged 'MD' (modal) and report the total.
# The original tested 'LS', so it repeated the list-item-marker count.
modals_counter = sum(1 for _, pos in statistic3 if pos == 'MD')
modals_statistic = f'{modals_counter} Modals'
print(modals_statistic)
| |
| | |
# Count the tokens tagged 'PDT' (predeterminer) and report the total.
Predeterminer_counter = sum(1 for _, pos in statistic3 if pos == 'PDT')
Predeterminer_statistic = f'{Predeterminer_counter} Predeterminers'
print(Predeterminer_statistic)
| |
| | |
| | |
# Count the tokens tagged 'POS' (possessive ending) and report the total.
# The original tested 'PDT', so it repeated the predeterminer count.
Possessive_ending_counter = sum(1 for _, pos in statistic3 if pos == 'POS')
Possessive_ending_statistic = f'{Possessive_ending_counter} Possessive endings'
print(Possessive_ending_statistic)
| |
| | |
| | |
# Count the tokens tagged 'RP' (particle) and report the total.
# The original assigned each increment to a misspelled name
# (`Particle_counter`), so `particle_counter` always stayed at 0.
particle_counter = sum(1 for _, pos in statistic3 if pos == 'RP')
particle_statistic = f'{particle_counter} Particles'
print(particle_statistic)
| |
| | |
| | |
# Count the tokens tagged 'SYM' (symbol) and report the total.
symbol_counter = sum(1 for _, pos in statistic3 if pos == 'SYM')
symbol_statistic = f'{symbol_counter} Symbols'
print(symbol_statistic)
| |
| | |
| | |
# Count the tokens tagged 'TO' and report the total.
to_counter = sum(1 for _, pos in statistic3 if pos == 'TO')
to_statistic = f'{to_counter} to'
print(to_statistic)
| |
| | |
| | |
# Count the tokens tagged 'UH' (interjection) and report the total.
# The original tested 'TO', so it repeated the "to" count.
interjection_counter = sum(1 for _, pos in statistic3 if pos == 'UH')
interjection_statistic = f'{interjection_counter} Interjections'
print(interjection_statistic)
| |
| | |
| | |
# Count the tokens tagged 'WDT' (wh-determiner) and report the total.
# The original tested 'TO', so it repeated the "to" count.
Wh_determiner_counter = sum(1 for _, pos in statistic3 if pos == 'WDT')
Wh_determiner_statistic = f'{Wh_determiner_counter} Wh determiners'
print(Wh_determiner_statistic)
| |
| | |
| | |
# Count the tokens tagged 'WP' (wh-pronoun) and report the total.
# The original tested 'TO', so it repeated the "to" count.
Wh_pronoun_counter = sum(1 for _, pos in statistic3 if pos == 'WP')
Wh_pronoun_statistic = f'{Wh_pronoun_counter} Wh pronouns'
print(Wh_pronoun_statistic)
| |
| | |
| | |
# Count the tokens tagged 'WP$' (possessive wh-pronoun) and report the total.
# The original tested 'TO', so it repeated the "to" count.
Possessive_wh_pronoun_counter = sum(
    1 for _, pos in statistic3 if pos == 'WP$'
)
Possessive_wh_pronoun_statistic = f'{Possessive_wh_pronoun_counter} Possessive wh pronouns'
print(Possessive_wh_pronoun_statistic)
| |
| | |
# Character length of every collected token, per word class.
# Despite the `dic` prefix these are plain lists, in text order.
dic1 = list(map(len, verblist))
print(dic1)

dic2 = list(map(len, nounlist))
print(dic2)

dic3 = list(map(len, adjectivslist))
print(dic3)

dic4 = list(map(len, preposition_or_subordinating_conjunctionlist))
print(dic4)

#print([len (i) for i in verblist_and_nounlists])
#print([len (i) for i in words])
| |
| | |
| | |
| | |
# Scale every word length up by a factor of 100 (despite the name, the values
# are not doubled).
# NOTE(review): indentation was lost in the source; each print is assumed to
# run once, after its list has been built.
double_numbers1 = [length * 100 for length in dic1]
print(double_numbers1)

double_numbers2 = [length * 100 for length in dic2]
print(double_numbers2)

double_numbers3 = [length * 100 for length in dic3]
print(double_numbers3)

double_numbers4 = [length * 100 for length in dic4]
print(double_numbers4)
| |
| | |
# Scale every word length down by a factor of 100 (true division, so the
# results are floats).
# NOTE(review): indentation was lost in the source; each print is assumed to
# run once, after its list has been built.
div_numbers1 = [length / 100 for length in dic1]
print(div_numbers1)

div_numbers2 = [length / 100 for length in dic2]
print(div_numbers2)

div_numbers3 = [length / 100 for length in dic3]
print(div_numbers3)

div_numbers4 = [length / 100 for length in dic4]
print(div_numbers4)
| |
| | |
| | |
| '''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
| |
| print((zip(*lst1))[0])'''
| |
| | |
| '''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
| |
| lst2 = []
| |
| lst2.append([x[0]for x in lst1])
| |
| print(lst2 [0])'''
| |
| | |
| '''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
| |
| outputlist = []
| |
| for values in lst1:
| |
| outputlist.append(values[-1])
| |
| print(outputlist)'''
| |
| | |
| | |
# Short aliases for the four scaled-up length lists (verbs, nouns,
# adjectives, prepositions). n1..n4 are read at index 0, n5..n8 at index 1.
n1 = n5 = double_numbers1
n2 = n6 = double_numbers2
n3 = n7 = double_numbers3
# The original bound n8 to double_numbers3, so the last printed pair repeated
# the adjective value instead of pairing it with the preposition value.
n4 = n8 = double_numbers4

# First scaled verb length; written to Output.txt further down.
n1_a = n1[0]
print(n1_a)

# Pairs built from the first and second entries of each list.
# Raises IndexError if any list is too short for the index read from it.
print((n1[0], n2[0]), (n3[0], n4[0]), (n5[1], n6[1]), (n7[1], n8[1]))
| |
| | |
# Short aliases for the four scaled-down length lists.
n1a, n2a, n3a, n4a = div_numbers1, div_numbers2, div_numbers3, div_numbers4

# First scaled-down value of each word class, on one line.
print(n1a[0], n2a[0], n3a[0], n4a[0])
| |
| | |
# Write the first scaled verb length to Output.txt. `n1_a` is an int and a
# text-mode write() only accepts str, so it is converted first (the original
# raised TypeError here). The context manager closes the file even on error.
with open("Output.txt", "w") as text_file:
    text_file.write(str(n1_a))
| |
| | |
| | |
| | |
| | |
# Whole-text totals: whitespace-separated words, lines, and characters.
wordsnumber_statistic = len(content.split())
numberoflines_statistic = len(content.splitlines())
numberofcharacters_statistic = len(content)

#print(wordsnumber_statistic)

print("Number of lines:")
print(numberoflines_statistic)

print("Number of characters:")
print(numberofcharacters_statistic)
| |
| | |
| | |
# Occurrence count per word, as a plain dict keyed in order of first
# appearance.
d = dict(Counter(words))
#print(d)

# The same counts as a list of (count, word) tuples; the list is not sorted.
word_freq = [(count, token) for token, count in d.items()]
#print(word_freq)

# Occurrence count of every character in the text.
lettercounter = Counter(content)
| |
| | |
| #print(lettercounter)</code>
| |
| | |
| | |
| | |
|
| |
|
|
| |
|