PythonLabZalan: Difference between revisions
No edit summary |
|||
Line 51: | Line 51: | ||
For the NLTK text analysis I used one of pages of my reader. | For the NLTK text analysis I used one of pages of my reader. | ||
First NLTK Analysis in python3 ( | First NLTK Analysis in python3 (see below) to get different data from the textual input such as (see NLTK analysis outcome): | ||
'''NLTK Analysis''' | |||
*Amount of words | *Amount of words | ||
Line 65: | Line 68: | ||
*Amount of Adjectives | *Amount of Adjectives | ||
*Amount of lines | *Amount of lines | ||
'''NLTK Analysis Script''' | |||
<syntaxhighlight lang="python" line='line'> | <syntaxhighlight lang="python" line='line'> |
Revision as of 16:47, 24 March 2018
Terminal
First, I looked into basic command-line functions (File:Commands terminal.pdf) and their operations, to build a solid base for Python 3.
Optical character recognition + Tesseract
Secondly, I experimented in the Terminal with converting PDF or JPG files to .txt files using tesseract and imagemagick (convert).
Tesseract (with languages you will be using)
- Mac
brew install tesseract --all-languages
imagemagick
- Mac
brew install imagemagick
How to use it?
tesseract - png - name of the txt file
tesseracttest SZAKACS$ tesseract namefile.png text2.txt
Getting 1 page from PDF file with PDFTK burst
pdftk yourfile.pdf burst
Or use imagemagick
convert -density 300 Typewriter\ Art\ -\ Riddell\ Alan.pdf Typewriter-%03d.tiff
Choose the page you want to convert
Convert PDF to bit-map using imagemagick, with some options to optimize OCR
convert -density 300 page.pdf -depth 8 -strip -background white -alpha off output.tiff
-density 300
resolution 300DPI. Lower resolutions will create errors :)
-depth 8
number of bits for color. 8bit depth == grey-scale
-strip -background white -alpha off
removes alpha channel (opacity), and makes the background white
output.tiff
in previous versions Tesseract only accepted images as tiffs, but currently more bitmap formats are accepted
Python3
To be able to understand how NLTK works I did an intensive python beginners learning week from 26.02.–04.03.2018.
Natural Language Tool Kit
For the NLTK text analysis I used one of the pages of my reader. The first NLTK analysis script in Python 3 (see below) extracts different data from the textual input, such as (see NLTK analysis outcome):
NLTK Analysis
- Amount of words
- The number of lowercase letters
- The number of uppercase letters
- 10 most common characters
- 10 most common words
- Words of the text that are longer than 15 characters
- Amount of Verbs
- Amount of Nouns
- Amount of Adverbs
- Amount of Pronouns
- Amount of Adjectives
- Amount of lines
NLTK Analysis Script
import nltk
from nltk import word_tokenize
from nltk import FreqDist
from nltk.tokenize import sent_tokenize
from sys import stdin,stdout
import re
import sys, string
#importing nltk library word_tokenize
from collections import Counter
# Read the whole reader page that the rest of the script analyses.
# The context manager closes the file handle even if reading fails, and the
# explicit encoding keeps the result independent of the platform default.
with open("readertest.txt", encoding="utf-8") as text:
    content = text.read()
#print(content)

# Split on any run of whitespace. Splitting on a single " " left line breaks
# glued into tokens and produced empty strings for repeated spaces, which
# distorted the word count and the word frequencies.
words = content.split()

# Alphabetically sorted list of the distinct words.
splitting_statistic = sorted(set(words))
#print(splitting_statistic)

# Total number of words in the text.
wordsamount_statistic = f'{len(words)} Amount of the words'
print(wordsamount_statistic)
# Count the lowercase and uppercase letters of the text.
# The text is iterated directly instead of being copied into a variable
# named `string`, which used to hide the imported `string` module.
count1 = sum(1 for character in content if character.islower())
count2 = sum(1 for character in content if character.isupper())
print("The number of lowercase characters is:")
print(count1)
print("The number of uppercase characters is:")
print(count2)
# Ten most frequent characters: iterating the text yields single characters.
print("10 most common characters:")
print(FreqDist(content).most_common(10))

# Ten most frequent words; `fdist` keeps the word distribution afterwards.
fdist = FreqDist(words)
print("10 most common words:", fdist.most_common(10), sep="\n")

# Kept as a note: printing only the counts of the most common words.
#new_list = fdist.most_common()
#print(new_list)
#for word, _ in new_list: #_ ignores the second variable, dictionary (key, value)
#print(' ',_)
def vowel_or_consonants(c):
    """Classify a single character as a vowel, a consonant or neither.

    Args:
        c: The string to classify; expected to be one character long.

    Returns:
        'Vowel' for a, e, i, o, u in either case, 'Consonant' for any other
        alphabetic character, and 'Neither' for everything else (digits,
        punctuation, whitespace, the empty string, or longer strings).
    """
    # Only a single alphabetic character can be classified. The previous
    # substring test (c.lower() in 'aeiou') reported e.g. 'io' as a vowel
    # but 'oi' as a consonant.
    if len(c) != 1 or not c.isalpha():
        return 'Neither'
    if c.lower() in {'a', 'e', 'i', 'o', 'u'}:
        return 'Vowel'
    return 'Consonant'
#for c in (content):
#print(c, vowel_or_consonants(c))
#print(sent_tokenize(content))
#splitting text into sentences
#for word in (words):
#print(word)
#control structure, each word in a seperate line
#fdist = FreqDist(words)
#print("hapaxes:")
#print(fdist.hapaxes())
#words that occur once only, the so-called hapaxes
# Distinct words of the text.
V = set(words)
# Keep only the words that are longer than 15 characters.
long_words = list(filter(lambda candidate: len(candidate) > 15, V))
print("printing the more than 15 character long words of the text")
print(sorted(long_words))
# Tokenize the text with NLTK, then tag every token with its part of speech.
# statistic3 is iterated below as (token, tag) pairs.
tokenized_content = nltk.word_tokenize(content)
statistic3 = nltk.pos_tag(tokenized_content)
#print(statistic3)
# Collect every token carrying one of the verb tags and count them.
verblist = [token for token, pos in statistic3
            if pos in {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}]
verbscounter = len(verblist)
verb_statistic = f'{verbscounter} Verbs'
print(verb_statistic)
print(verblist)
# Collect every token carrying one of the noun tags and count them.
nounlist = [token for token, pos in statistic3
            if pos in {'NNP', 'NNS', 'NN', 'NNPS'}]
nouncounter = len(nounlist)
nouns_statistic = f'{nouncounter} Nouns'
print(nouns_statistic)
print(nounlist)
# Map each verb to the noun found at the same position in the noun list.
verblist2 = verblist
nounlist2 = nounlist
# zip() stops at the shorter list. The former index loop raised IndexError
# whenever the text contained more verbs than nouns, and its result was
# overwritten by the zip-based dictionary anyway.
verb_noun_dictionary = dict(zip(verblist2, nounlist2))
verblist_and_nounlists = dict(zip(verblist2, nounlist2))
print(verblist_and_nounlists)
print(len(verblist))
# One-element list wrapping the word list; not used by the code shown here
# and kept only so the module-level name stays available.
characters = [words]
#print(words)
'''from itertools import groupby
def n_letter_dictionary(string):
result = {}
for key, group in groupby(sorted(string.split(), key = lambda x: len(x)), lambda x: len(x)):
result[key] = list(group)
return result
print(n_letter_dictionary)'''
# Collect every token carrying one of the adverb tags and count them.
adverblist = [token for token, pos in statistic3
              if pos in {'RB', 'RBR', 'RBS', 'WRB'}]
adverbscounter = len(adverblist)
adverb_statistic = f'{adverbscounter} Adverbs'
print(adverb_statistic)
print(adverblist)
# Collect every token tagged as a personal or possessive pronoun and count them.
pronounslist = [token for token, pos in statistic3
                if pos in {'PRP', 'PRP$'}]
pronounscounter = len(pronounslist)
pronoun_statistic = f'{pronounscounter} Pronouns'
print(pronoun_statistic)
print(pronounslist)
# Collect every token carrying one of the adjective tags and count them.
adjectivslist = [token for token, pos in statistic3
                 if pos in {'JJ', 'JJR', 'JJS'}]
adjectivscounter = len(adjectivslist)
adjectiv_statistic = f'{adjectivscounter} Adjectives'
print(adjectiv_statistic)
print(adjectivslist)
# Number of tokens tagged 'CC' (coordinating conjunction).
coordinating_conjuction_counter = sum(1 for _token, pos in statistic3 if pos == 'CC')
coordinating_conjuction_statistic = f'{coordinating_conjuction_counter} Coordinating conjuctions'
print(coordinating_conjuction_statistic)
# Number of tokens tagged 'CD' (cardinal number).
# The loop used to test for 'CC', so it repeated the coordinating-conjunction
# count instead of counting cardinal numbers.
cardinal_number = sum(1 for _token, pos in statistic3 if pos == 'CD')
cardinal_number_statistic = f'{cardinal_number} Cardinal numbers'
print(cardinal_number_statistic)
# Number of tokens tagged 'DT' (determiner).
# The loop used to test for 'D', which is not a Penn Treebank tag, so the
# count was always 0.
determiner_counter = sum(1 for _token, pos in statistic3 if pos == 'DT')
determiner_statistic = f'{determiner_counter} Determiners'
print(determiner_statistic)
# Number of tokens tagged 'EX' (existential "there").
existential_there_counter = sum(1 for _token, pos in statistic3 if pos == 'EX')
existential_there_statistic = f'{existential_there_counter} Existential there'
print(existential_there_statistic)
# Number of tokens tagged 'FW' (foreign word).
foreing_words_counter = sum(1 for _token, pos in statistic3 if pos == 'FW')
foreing_words_statistic = f'{foreing_words_counter} Foreing words'
print(foreing_words_statistic)
# Collect every token tagged 'IN' (preposition or subordinating conjunction)
# and count them.
preposition_or_subordinating_conjunctionlist = [
    token for token, pos in statistic3 if pos == 'IN'
]
preposition_or_subordinating_conjunction_counter = len(
    preposition_or_subordinating_conjunctionlist
)
preposition_or_subordinating_conjunction_statistic = f'{preposition_or_subordinating_conjunction_counter} Preposition or subordinating conjunctions'
print(preposition_or_subordinating_conjunction_statistic)
print(preposition_or_subordinating_conjunctionlist)
# Number of tokens tagged 'LS' (list item marker).
list_item_marker_counter = sum(1 for _token, pos in statistic3 if pos == 'LS')
list_item_marker_statistic = f'{list_item_marker_counter} List item markers'
print(list_item_marker_statistic)
# Number of tokens tagged 'MD' (modal).
# The loop used to test for 'LS', so it repeated the list-item-marker count
# instead of counting modals.
modals_counter = sum(1 for _token, pos in statistic3 if pos == 'MD')
modals_statistic = f'{modals_counter} Modals'
print(modals_statistic)
# Number of tokens tagged 'PDT' (predeterminer).
Predeterminer_counter = sum(1 for _token, pos in statistic3 if pos == 'PDT')
Predeterminer_statistic = f'{Predeterminer_counter } Predeterminers'
print(Predeterminer_statistic)
# Number of tokens tagged 'POS' (possessive ending).
# The loop used to test for 'PDT', so it repeated the predeterminer count
# instead of counting possessive endings.
Possessive_ending_counter = sum(1 for _token, pos in statistic3 if pos == 'POS')
Possessive_ending_statistic = f'{Possessive_ending_counter} Possessive endings'
print(Possessive_ending_statistic)
# Number of tokens tagged 'RP' (particle).
# The loop used to assign to a misspelled name (`Particle_counter`), so
# `particle_counter` never left 0 and the printed count was always 0.
particle_counter = sum(1 for _token, pos in statistic3 if pos == 'RP')
particle_statistic = f'{particle_counter} Particles'
print(particle_statistic)
# Number of tokens tagged 'SYM' (symbol).
symbol_counter = sum(1 for _token, pos in statistic3 if pos == 'SYM')
symbol_statistic = f'{symbol_counter} Symbols'
print(symbol_statistic)
# Number of tokens tagged 'TO' (the word "to").
to_counter = sum(1 for _token, pos in statistic3 if pos == 'TO')
to_statistic = f'{to_counter} to'
print(to_statistic)
# Number of tokens tagged 'UH' (interjection).
# The loop used to test for 'TO', so it repeated the "to" count instead of
# counting interjections.
interjection_counter = sum(1 for _token, pos in statistic3 if pos == 'UH')
interjection_statistic = f'{interjection_counter} Interjections'
print(interjection_statistic)
# Number of tokens tagged 'WDT' (wh-determiner).
# The loop used to test for 'TO', so it repeated the "to" count instead of
# counting wh-determiners.
Wh_determiner_counter = sum(1 for _token, pos in statistic3 if pos == 'WDT')
Wh_determiner_statistic = f'{Wh_determiner_counter} Wh determiners'
print(Wh_determiner_statistic)
# Number of tokens tagged 'WP' (wh-pronoun).
# The loop used to test for 'TO', so it repeated the "to" count instead of
# counting wh-pronouns.
Wh_pronoun_counter = sum(1 for _token, pos in statistic3 if pos == 'WP')
Wh_pronoun_statistic = f'{Wh_pronoun_counter} Wh pronouns'
print(Wh_pronoun_statistic)
# Number of tokens tagged 'WP$' (possessive wh-pronoun).
# The loop used to test for 'TO', so it repeated the "to" count instead of
# counting possessive wh-pronouns.
Possessive_wh_pronoun_counter = sum(1 for _token, pos in statistic3 if pos == 'WP$')
Possessive_wh_pronoun_statistic = f'{Possessive_wh_pronoun_counter} Possessive wh pronouns'
print(Possessive_wh_pronoun_statistic)
# Character length of every collected verb, noun, adjective and
# preposition/subordinating conjunction, in that order.
dic1 = list(map(len, verblist))
print(dic1)
dic2 = list(map(len, nounlist))
print(dic2)
dic3 = list(map(len, adjectivslist))
print(dic3)
dic4 = list(map(len, preposition_or_subordinating_conjunctionlist))
print(dic4)
#print([len (i) for i in verblist_and_nounlists])
#print([len (i) for i in words])
# Each word length multiplied by 100, one list per length list.
double_numbers1 = [length * 100 for length in dic1]
print(double_numbers1)
double_numbers2 = [length * 100 for length in dic2]
print(double_numbers2)
double_numbers3 = [length * 100 for length in dic3]
print(double_numbers3)
double_numbers4 = [length * 100 for length in dic4]
print(double_numbers4)
# Each word length divided by 100 (true division, so the results are floats).
div_numbers1 = [length / 100 for length in dic1]
print(div_numbers1)
div_numbers2 = [length / 100 for length in dic2]
print(div_numbers2)
div_numbers3 = [length / 100 for length in dic3]
print(div_numbers3)
div_numbers4 = [length / 100 for length in dic4]
print(div_numbers4)
'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
print((zip(*lst1))[0])'''
'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
lst2 = []
lst2.append([x[0]for x in lst1])
print(lst2 [0])'''
'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
outputlist = []
for values in lst1:
outputlist.append(values[-1])
print(outputlist)'''
# Aliases for the four scaled length lists: n1-n4 supply the first element
# and n5-n8 the second element of lists 1 to 4 in the print below.
n1 = double_numbers1
# First scaled value of the first list; printed here and read again later
# in the script.
n1_a = n1[0]
print(n1_a)
n2 = double_numbers2
#print(n2[0])
n3 = double_numbers3
#print(n3[0])
n4 = double_numbers4
#print(n4[0])
n5 = double_numbers1
#print(n5[1])
n6 = double_numbers2
#print(n6[1])
n7 = double_numbers3
#print(n7[1])
# Was double_numbers3, a copy-paste slip that printed the third list's value
# twice and never showed the second value of the fourth list.
n8 = double_numbers4
#print(n8[1])
print((n1[0], n2[0]), (n3[0], n4[0]), (n5[1], n6[1]), (n7[1], n8[1]))
# Aliases for the four divided length lists; print the first value of each.
n1a, n2a, n3a, n4a = div_numbers1, div_numbers2, div_numbers3, div_numbers4
print(n1a[0], n2a[0], n3a[0], n4a[0])
# Save the first scaled value to Output.txt. write() only accepts str, so
# the integer is converted first (passing the int raised TypeError). The
# context manager closes the file even if the write fails.
with open("Output.txt", "w") as text_file:
    text_file.write(str(n1_a))
# Number of whitespace-separated words (computed but not printed).
wordsnumber_statistic = len(content.split())
#print(wordsnumber_statistic)

# Number of lines in the text.
numberoflines_statistic = len(content.splitlines())
print("Number of lines:", numberoflines_statistic, sep="\n")

# Number of characters in the text, whitespace included.
numberofcharacters_statistic = len(content)
print("Number of characters:", numberofcharacters_statistic, sep="\n")
# Occurrences of each word, keyed by the word, in order of first appearance.
d = dict(Counter(words))
#print(d)

# The same counts as a list of (count, word) tuples; the list is not sorted.
word_freq = [(occurrences, token) for token, occurrences in d.items()]
#print(word_freq)

# Occurrences of each character of the text.
lettercounter = Counter(content)
#print(lettercounter)