Revision as of 17:33, 24 March 2018

Terminal

Firstly I looked into basic command line functions File:Commands terminal.pdf and their operations for creating a solid base for Python3.

Optical character recognition + Tesseract

Secondarily I experimented in Terminal how to translate PDF or JPG to .txt files with tesseract and imagemagick (convert).

Optical character recognition

Input 1

Output 1

Tesseract (with languages you will be using)

Mac brew install tesseract --all-languages

imagemagick

Mac brew install imagemagick

How to use it?

tesseract - png - name of the txt file

tesseracttest SZAKACS$ tesseract namefile.png text2.txt

Getting 1 page from PDF file with PDFTK burst

pdftk yourfile.pdf burst

Or use imagemagick

convert -density 300 Typewriter\ Art\ -\ Riddell\ Alan.pdf Typewriter-%03d.tiff

Chose page you want to convert

Convert PDF to bit-map using imagemagick, with some options to optimize OCR

convert -density 300 page.pdf -depth 8 -strip -background white -alpha off ouput.tiff
-density 300 resolution 300DPI. Lower resolutions will create errors :)
-depth 8number of bits for color. 8bit depth == grey-scale
-strip -background white -alpha off removes alpha channel (opacity), and makes the background white
output.tiffin previous versions Tesseract only accepted images as tiffs, but currently more bitmap formats are accepted

Python3

Input 2

Output 2

PythonLabZalan: Difference between revisions

Revision as of 17:33, 24 March 2018

Contents

Terminal

Optical character recognition + Tesseract

Python3

Natural Language Tool Kit

DrawBot

ACCP (Analogue Circular Communication Protocol

@@ Line 44: / Line 44: @@
 [[File:Screen Shot 2018-03-24 at 16.12.30.png|thumb|NLTK Analysis outcome]]
+First NLTK Analysis in python3
-<code>import nltk
-from nltk import word_tokenize
-from nltk import FreqDist
-from nltk.tokenize import sent_tokenize
-from sys import stdin,stdout
-import re
-import sys, string
-#importing nltk library word_tokenize
-from collections import Counter
-text = open ("readertest.txt")
-content = text.read()
-#importing and reading the content
-#print(content)
-words = content.split(" ")
-#the string content needs to signifier - needs to be splitted to be able to read it, it detects if a new words begins based on the " "
-splitting_statistic = sorted (set (words))
-# the content is splitted
-#print(splitting_statistic)
-wordsamount_statistic = f'{len(words)} Amount of the words'
-#amount of the words
-print(wordsamount_statistic)
-string=(content)
-count1=0
-count2=0
-for i in string:
-      if(i.islower()):
-            count1=count1+1
-      elif(i.isupper()):
-            count2=count2+1
-print("The number of lowercase characters is:")
-print(count1)
-print("The number of uppercase characters is:")
-print(count2)
-#counts the lowercase and uppercase letters in the text
-fdist = FreqDist(content)
-print("10 most common characters:")
-print(fdist.most_common(10))
-#print out the 10 most common letters
-fdist = FreqDist(words)
-print("10 most common words:")
-print(fdist.most_common(10))
-#print out the 10 most common words
-#new_list = fdist.most_common()
-#print(new_list)
-#for word, _ in new_list:  #_ ignores the second variable, dictionary (key, value)
-    #print(' ',_)
-#prints a list of the most common words - how to make it better in one line
-def vowel_or_consonants (c):
-	if not c.isalpha():
-		return 'Neither'
-	vowels = 'aeiou'
-	if c.lower() in vowels:
-		return 'Vowel'
-	else:
-		return 'Consonant'
-#for c in (content):
-	#print(c, vowel_or_consonants(c))
-#print(sent_tokenize(content))
-#splitting text into sentences
-#for word in (words):
-	#print(word)
-#control structure, each word in a seperate line
-#fdist = FreqDist(words)
-#print("hapaxes:")
-#print(fdist.hapaxes())
-#words that occur once only, the so-called hapaxes
-V = set(words)
-long_words = [w for w in V if len(w) > 15]
-print("printing the more than 15 character long words of the text")
-print(sorted(long_words))
-#printing the more than 15 character long words of the text
-tokenized_content = word_tokenize(content)
-#the content is tokenized (nltk library)
-statistic3 = nltk.pos_tag(tokenized_content)
-#each word becomes a tag if is a verb, noun, adverb, pronoun, adjective)
-#print(statistic3)
-verbscounter = 0
-verblist = []
-for word, tag in statistic3:
-	if tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
-		verbscounter = verbscounter + 1
-		verblist.append(word)
-verb_statistic = f'{verbscounter} Verbs'
-# shows the amount of verbs in the text
-print(verb_statistic)
-print(verblist)
-#creating a list from the verb counter
-#creating a dictionary from a list
-nouncounter = 0
-nounlist = []
-for word, tag in statistic3:
-	if tag in {'NNP','NNS','NN', 'NNPS'}:
-		nouncounter = nouncounter + 1
-		nounlist.append(word)
-nouns_statistic = f'{nouncounter} Nouns'
-#shows the amount of nouns in the text
-print(nouns_statistic)
-print(nounlist)
-verblist2 = verblist
-nounlist2 = nounlist
-verb_noun_dictionary = {}
-for i in range (len(verblist2)):
-	verb_noun_dictionary[verblist2[i]] = nounlist2 [i]
-verblist_and_nounlists = zip (verblist2, nounlist2)
-verb_noun_dictionary = dict(verblist_and_nounlists)
-verblist_and_nounlists = dict(zip(verblist2, nounlist2))
-print(verblist_and_nounlists)
-print(len(verblist))
-characters = [words]
-#print(words)
-'''from itertools import groupby
-def n_letter_dictionary(string):
-    result = {}
-    for key, group in groupby(sorted(string.split(), key = lambda x: len(x)), lambda x: len(x)):
-        result[key] = list(group)
-    return result
- print(n_letter_dictionary)'''
-adverbscounter = 0
-adverblist = []
-for word, tag in statistic3:
-	if tag in {'RB','RBR','RBS','WRB'}:
-		adverbscounter = adverbscounter + 1
-		adverblist.append(word)
-adverb_statistic = f'{adverbscounter} Adverbs'
-#shows the amount of adverbs in the text
-print(adverb_statistic)
-print(adverblist)
-pronounscounter = 0
-pronounslist = []
-for word, tag in statistic3:
-	if tag in {'PRP','PRP$'}:
-		pronounscounter = pronounscounter + 1
-		pronounslist.append(word)
-pronoun_statistic = f'{pronounscounter} Pronouns'
-#shows the amount of pronouns in the text
-print(pronoun_statistic)
-print(pronounslist)
-adjectivscounter = 0
-adjectivslist = []
-for word, tag in statistic3:
-	if tag in {'JJ','JJR','JJS'}:
-		adjectivscounter = adjectivscounter + 1
-		adjectivslist.append(word)
-adjectiv_statistic = f'{adjectivscounter} Adjectives'
-#shows the amount of adjectives in the text
-print(adjectiv_statistic)
-print(adjectivslist)
-coordinating_conjuction_counter = 0
-for word, tag in statistic3:
-	if tag in {'CC'}:
-		coordinating_conjuction_counter = coordinating_conjuction_counter + 1
-coordinating_conjuction_statistic = f'{coordinating_conjuction_counter} Coordinating conjuctions'
-#shows the amount of coordinating_conjuction in the text
-print(coordinating_conjuction_statistic)
-cardinal_number = 0
-for word, tag in statistic3:
-	if tag in {'CC'}:
-		cardinal_number = cardinal_number + 1
-cardinal_number_statistic = f'{cardinal_number} Cardinal numbers'
-#shows the amount of cardinal_number in the text
-print(cardinal_number_statistic)
-determiner_counter = 0
-for word, tag in statistic3:
-	if tag in {'D'}:
-		determiner_counter = determiner_counter + 1
-determiner_statistic = f'{determiner_counter} Determiners'
-#shows the amount of Determiners in the text
-print(determiner_statistic)
-existential_there_counter = 0
-for word, tag in statistic3:
-	if tag in {'EX'}:
-		existential_there_counter = existential_there_counter + 1
-existential_there_statistic = f'{existential_there_counter} Existential there'
-#shows the amount of Existential there in the text
-print(existential_there_statistic)
-foreing_words_counter = 0
-for word, tag in statistic3:
-	if tag in {'FW'}:
-		foreing_words_counter = foreing_words_counter + 1
-foreing_words_statistic = f'{foreing_words_counter} Foreing words'
-#shows the amount of foreing words in the text
-print(foreing_words_statistic)
-preposition_or_subordinating_conjunctionlist = []
-preposition_or_subordinating_conjunction_counter = 0
-for word, tag in statistic3:
-	if tag in {'IN'}:
-		preposition_or_subordinating_conjunction_counter = preposition_or_subordinating_conjunction_counter + 1
-		preposition_or_subordinating_conjunctionlist.append(word)
-preposition_or_subordinating_conjunction_statistic = f'{preposition_or_subordinating_conjunction_counter} Preposition or subordinating conjunctions'
-#shows the amount of preposition_or_subordinating_conjunction in the text
-print(preposition_or_subordinating_conjunction_statistic)
-print(preposition_or_subordinating_conjunctionlist)
-list_item_marker_counter = 0
-for word, tag in statistic3:
-	if tag in {'LS'}:
-		list_item_marker_counter = list_item_marker_counter + 1
-list_item_marker_statistic = f'{list_item_marker_counter} List item markers'
-#shows the amount of list item markers in the text
-print(list_item_marker_statistic )
-modals_counter = 0
-for word, tag in statistic3:
-	if tag in {'LS'}:
-		modals_counter = modals_counter + 1
-modals_statistic = f'{modals_counter} Modals'
-#shows the amount of modals in the text
-print(modals_statistic)
-Predeterminer_counter = 0
-for word, tag in statistic3:
-	if tag in {'PDT'}:
-		Predeterminer_counter = Predeterminer_counter  + 1
-Predeterminer_statistic = f'{Predeterminer_counter } Predeterminers'
-#shows the amount of Predeterminers in the text
-print(Predeterminer_statistic)
-Possessive_ending_counter = 0
-for word, tag in statistic3:
-	if tag in {'PDT'}:
-		Possessive_ending_counter = Possessive_ending_counter + 1
-Possessive_ending_statistic = f'{Possessive_ending_counter} Possessive endings'
-#shows the amount of Possessive endings in the text
-print(Possessive_ending_statistic)
-particle_counter = 0
-for word, tag in statistic3:
-	if tag in {'RP'}:
-		Particle_counter = particle_counter + 1
-particle_statistic = f'{particle_counter} Particles'
-#shows the amount of Particles endings in the text
-print(particle_statistic)
-symbol_counter = 0
-for word, tag in statistic3:
-	if tag in {'SYM'}:
-		symbol_counter = symbol_counter + 1
-symbol_statistic = f'{symbol_counter} Symbols'
-#shows the amount of symbols in the text
-print(symbol_statistic)
-to_counter = 0
-for word, tag in statistic3:
-	if tag in {'TO'}:
-		to_counter = to_counter + 1
-to_statistic = f'{to_counter} to'
-#shows the amount of to in the text
-print(to_statistic)
-interjection_counter = 0
-for word, tag in statistic3:
-	if tag in {'TO'}:
-		interjection_counter = interjection_counter + 1
-interjection_statistic = f'{interjection_counter} Interjections'
-#shows the amount of interjections in the text
-print(interjection_statistic)
-Wh_determiner_counter = 0
-for word, tag in statistic3:
-	if tag in {'TO'}:
-		Wh_determiner_counter = Wh_determiner_counter + 1
-Wh_determiner_statistic = f'{Wh_determiner_counter} Wh determiners'
-#shows the amount of Wh determiners in the text
-print(Wh_determiner_statistic)
-Wh_pronoun_counter = 0
-for word, tag in statistic3:
-	if tag in {'TO'}:
-		Wh_pronoun_counter = Wh_pronoun_counter + 1
-Wh_pronoun_statistic = f'{Wh_pronoun_counter} Wh pronouns'
-#shows the amount of Wh pronouns in the text
-print(Wh_pronoun_statistic)
-Possessive_wh_pronoun_counter = 0
-for word, tag in statistic3:
-	if tag in {'TO'}:
-		Possessive_wh_pronoun_counter  = Possessive_wh_pronoun_counter  + 1
-Possessive_wh_pronoun_statistic = f'{Possessive_wh_pronoun_counter} Possessive wh pronouns'
-#shows the amount of Possessive wh pronouns in the text
-print(Possessive_wh_pronoun_statistic)
-dic1 =([len (i) for i in verblist])
-print(dic1)
-dic2=([len (i) for i in nounlist])
-print(dic2)
-dic3=([len (i) for i in adjectivslist])
-print(dic3)
-dic4=([len (i) for i in preposition_or_subordinating_conjunctionlist])
-print(dic4)
-#print([len (i) for i in verblist_and_nounlists])
-#print([len (i) for i in words])
-double_numbers1 = []
-for n in dic1:
-	double_numbers1.append(n*100)
-print(double_numbers1)
-double_numbers2 = []
-for n in dic2:
-	double_numbers2.append(n*100)
-print(double_numbers2)
-double_numbers3 = []
-for n in dic3:
-	double_numbers3.append(n*100)
-print(double_numbers3)
-double_numbers4 = []
-for n in dic4:
-	double_numbers4.append(n*100)
-print(double_numbers4)
-div_numbers1= []
-for n in dic1:
-	div_numbers1.append(n/100)
-print(div_numbers1)
-div_numbers2= []
-for n in dic2:
-	div_numbers2.append(n/100)
-print(div_numbers2)
-div_numbers3= []
-for n in dic3:
-	div_numbers3.append(n/100)
-print(div_numbers3)
-div_numbers4= []
-for n in dic4:
-	div_numbers4.append(n/100)
-print(div_numbers4)
-'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
-print((zip(*lst1))[0])'''
-'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
-lst2 = []
-lst2.append([x[0]for x in lst1])
-print(lst2 [0])'''
-'''lst1 = [[double_numbers1], [double_numbers2], [double_numbers3], [double_numbers4]]
-outputlist = []
-for values in lst1:
-	outputlist.append(values[-1])
-print(outputlist)'''
-n1 = double_numbers1
-n1_a = (n1[0])
-print(n1_a)
-n2 = double_numbers2
-#print(n2[0])
-n3 = double_numbers3
-#print(n3[0])
-n4 = double_numbers4
-#print(n4[0])
-n5 = double_numbers1
-#print(n5[1])
-n6 = double_numbers2
-#print(n6[1])
-n7 = double_numbers3
-#print(n7[1])
-n8 = double_numbers3
-#print(n8[1])
-print((n1[0], n2[0]), (n3[0], n4[0]), (n5[1], n6[1]), (n7[1], n8[1]))
-n1a = div_numbers1
-#print(n1a[0])
-n2a = div_numbers2
-#print(n2a[0])
-n3a = div_numbers3
-#print(n3a[0])
-n4a = div_numbers4
-#print(n4a[0])
-print(n1a[0], n2a[0], n3a[0], n4a[0])
-text_file = open ("Output.txt", "w")
-text_file.write(n1_a)
-text_file.close()
-wordsnumber_statistic = len(content.split())
-#number of words
-#print(wordsnumber_statistic)
-numberoflines_statistic = len(content.splitlines())
-#number of lines
-print("Number of lines:")
-print(numberoflines_statistic)
-numberofcharacters_statistic = len(content)
-#number of characters
-print("Number of characters:")
-print(numberofcharacters_statistic)
-d ={}
-for word in words:
-	d[word] = d.get(word, 0) + 1
-#how many times a word accuers in the text, not sorted yet(next step)
-#print(d)
-word_freq =[]
-for key, value in d.items():
-	word_freq.append((value, key))
-#sorted the word count - converting a dictionary into a list
-#print(word_freq)
-lettercounter = Counter(content)
-#counts the letters in the text
-#print(lettercounter)</code>