User:Angeliki/2nd Trimester
OCR
Tesseract training:
1. Install Tesseract
2. Recipe for training
Reading- Writing
Reader
Python scripts
Python whisperer
import nltk
import collections
import random
import sys
from sys import stdin, stderr, stdout
# --- Collect the nouns that occur in the synopsis --------------------------
# Read the synopsis text; the context manager closes the handle right after
# the read instead of leaving it open for the rest of the script.
with open("Synopsis_24012018.txt", 'r') as o:
    original = o.read()

# Lowercase every token so capitalised words can match the noun list too.
# NOTE(review): assumes the noun list itself is lowercase -- confirm.
tokens = [token.lower() for token in nltk.word_tokenize(original)]

# Load and tokenize the reference noun list.
with open("nouns/91K nouns.txt") as v:
    nouns = v.read()
tokens_nouns = nltk.word_tokenize(nouns)

# Membership is tested once per synopsis token; a set keeps each test O(1)
# instead of scanning the whole noun list every time.
_known_nouns = set(tokens_nouns)

# Synopsis tokens that are listed nouns, in order of appearance with
# duplicates kept (a noun used more often is therefore more likely to be
# drawn when a random replacement is picked later).
newnouns = [word for word in tokens if word in _known_nouns]

# Text file whose words are loaded into `vocabulary` by read_input_text().
filename = 'Audiosfera-2015-Westerkamp.txt'
vocabulary = []
# NOTE(review): not referenced anywhere in the visible part of the script.
vocabulary_size = 1000
def read_input_text(filename):
    """Tokenize the text file *filename* and add its words to ``vocabulary``.

    The file is read in full, split into tokens with ``nltk.word_tokenize``,
    and every token is lowercased and appended, in document order, to the
    module-level ``vocabulary`` list.

    Args:
        filename: Path of the text file to read.

    Returns:
        None. The tokens are accumulated in the global ``vocabulary``.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as txtfile:
        string = txtfile.read()
    vocabulary.extend(word.lower() for word in nltk.word_tokenize(string))
# Fill `vocabulary` with the lowercased tokens of the input text.
read_input_text(filename)

# Build the lookup set once so each membership test below is O(1) rather
# than a scan over the full noun list.
noun_lookup = set(tokens_nouns)

# Swap every word of the input text that is a listed noun for a noun drawn
# at random from the synopsis nouns; all other tokens are kept as they are.
# If no synopsis nouns were found, words are left unchanged, because
# random.choice() raises IndexError on an empty sequence.
newsynopsis = [
    random.choice(newnouns) if newnouns and word in noun_lookup else word
    for word in vocabulary
]
print(" ".join(newsynopsis))