User:Angeliki/2nd Trimester: Difference between revisions

Revision as of 01:04, 30 January 2018

OCR

Tesseract training:

1. Install Tesseract
2. Recipe for training

Reading - Writing

Synopsis

Reader

Mini reader

Python scripts

Python whisperer

import nltk
import collections
import random
import sys

from sys import stdin, stderr, stdout

# Read the synopsis text and split it into word tokens.
with open("Synopsis_24012018.txt", 'r') as o:
    original = o.read()
# Lower-case every token. The earlier loop only rebound its loop variable,
# so `tokens` was left in its original case and capitalised words could
# never match the noun list.
tokens = [token.lower() for token in nltk.word_tokenize(original)]

# Read the noun list and tokenise it the same way.
with open("nouns/91K nouns.txt") as v:
    nouns = v.read()
tokens_nouns = nltk.word_tokenize(nouns)

# Build the lookup once: membership in a set is O(1), whereas scanning the
# token list for every synopsis word is O(len(tokens_nouns)).
_noun_lookup = set(tokens_nouns)

# Every synopsis token that also appears in the noun list, in text order
# and with repeats preserved.
newnouns = [word for word in tokens if word in _noun_lookup]

# NOTE(review): vocabulary_size is not referenced in the visible part of the
# script -- confirm whether something further down still needs it.
vocabulary_size = 1000

# Text file passed to read_input_text() below.
filename = "Audiosfera-2015-Westerkamp.txt"

# Filled in place by read_input_text() with the lower-cased tokens of the text.
vocabulary = []
def read_input_text(filename):
    """Tokenise the text file *filename* and add its words to ``vocabulary``.

    The whole file is read, split with ``nltk.word_tokenize``, and every
    token is lower-cased and appended to the module-level ``vocabulary``
    list. Nothing is returned; calling the function again keeps extending
    the same list.
    """
    # The context manager closes the handle even if reading fails; the
    # earlier version left the file open.
    with open(filename, 'r') as txtfile:
        string = txtfile.read()
    words = nltk.word_tokenize(string)
    vocabulary.extend(word.lower() for word in words)

# Fill `vocabulary` with the lower-cased tokens of the input text.
read_input_text(filename)

# Build the lookup once so each membership test is O(1) instead of a scan
# over the whole noun list.
_known_nouns = set(tokens_nouns)

# Replace every word that is a known noun with a randomly chosen noun taken
# from the synopsis; all other tokens pass through unchanged.
newsynopsis = []
for word in vocabulary:
    # random.choice raises IndexError on an empty sequence, so when the
    # synopsis yielded no nouns the word is kept as it is.
    if newnouns and word in _known_nouns:
        newsynopsis.append(random.choice(newnouns))
    else:
        newsynopsis.append(word)
print(" ".join(newsynopsis))