User:Angeliki/2nd Trimester: Difference between revisions
No edit summary |
|||
Line 23: | Line 23: | ||
== Python scripts == | == Python scripts == | ||
Python whisperer | Python whisperer | ||
<syntaxhighlight lang="python" line='line'> | |||
import nltk | |||
import collections | |||
import random | |||
import sys | |||
from sys import stdin, stderr, stdout | |||
o = open("Synopsis_24012018.txt", 'r') | |||
original = o.read() | |||
tokens = nltk.word_tokenize(original) | |||
for noun in tokens: | |||
noun = noun.lower() | |||
# print (tokens) | |||
v = open("nouns/91K nouns.txt") | |||
nouns = v.read() | |||
tokens_nouns = nltk.word_tokenize(nouns) | |||
# print (tokens_nouns) | |||
newnouns = [] | |||
for word in tokens: | |||
if word in tokens_nouns: | |||
n=tokens_nouns.index(word) | |||
# # print (n) | |||
newnouns.append(tokens_nouns[n]) | |||
# # print (newnouns) | |||
filename = 'Audiosfera-2015-Westerkamp.txt' | |||
vocabulary = [] | |||
vocabulary_size = 1000 | |||
def read_input_text(filename): | |||
txtfile = open(filename, 'r') | |||
string = txtfile.read() | |||
words = nltk.word_tokenize(string) | |||
# print (words) | |||
for word in words: | |||
word=word.lower() | |||
vocabulary.append(word) | |||
# print('Data size:', len(vocabulary)) | |||
read_input_text(filename) | |||
# print(vocabulary) | |||
newsynopsis = [] | |||
for word in vocabulary: | |||
if word in tokens_nouns: | |||
newsynopsis.append(random.choice(newnouns)) | |||
else: | |||
newsynopsis.append(word) | |||
print (" ".join(newsynopsis)) | |||
</syntaxhighlight> |
Revision as of 00:04, 30 January 2018
OCR
Tesseract training:
1. Install Tesseract
2. Recipe for training
Reading- Writing
Reader
Python scripts
Python whisperer
import nltk
import collections
import random
import sys
from sys import stdin, stderr, stdout
# Read the synopsis; the nouns found in it become the replacement pool.
# The `with` block closes the file as soon as its content has been read.
with open("Synopsis_24012018.txt", 'r') as o:
    original = o.read()

# Lowercase every token. The previous loop only rebound its loop variable,
# so `tokens` kept its original casing and capitalised words could never
# match the noun list.
# NOTE(review): assumes the noun list is stored in lowercase - confirm.
tokens = [token.lower() for token in nltk.word_tokenize(original)]

# Read the reference list of nouns.
with open("nouns/91K nouns.txt") as v:
    nouns = v.read()
tokens_nouns = nltk.word_tokenize(nouns)

# Build a set once so each membership test does not rescan the whole list.
known_nouns = set(tokens_nouns)

# Every synopsis token that is a known noun, in order and with repeats,
# so a noun that occurs often in the synopsis is picked more often later.
newnouns = [word for word in tokens if word in known_nouns]
# Text whose nouns get swapped out; passed to read_input_text() below.
filename = 'Audiosfera-2015-Westerkamp.txt'
# Filled in place by read_input_text() with the lowercased tokens of that text.
vocabulary = []
# NOTE(review): not referenced anywhere in the visible script - confirm whether it is still needed.
vocabulary_size = 1000
def read_input_text(filename):
    """Tokenize a text file and add its lowercased tokens to ``vocabulary``.

    Args:
        filename: Path of the text file to read.

    Returns:
        None. The tokens are appended, in reading order, to the
        module-level ``vocabulary`` list.
    """
    # Close the file as soon as it has been read instead of leaking the handle.
    with open(filename, 'r') as txtfile:
        string = txtfile.read()
    words = nltk.word_tokenize(string)
    vocabulary.extend(word.lower() for word in words)
# Fill `vocabulary` with the lowercased tokens of the input text.
read_input_text(filename)

# Build the lookup set once; testing each word against the raw noun list
# would rescan the whole list for every word of the text.
noun_lookup = set(tokens_nouns)

# Rewrite the text: every known noun is replaced by a random noun taken from
# the synopsis, every other token is kept unchanged.
newsynopsis = []
for word in vocabulary:
    # random.choice() raises IndexError on an empty sequence, so when the
    # synopsis yielded no nouns the original word is kept instead.
    if word in noun_lookup and newnouns:
        newsynopsis.append(random.choice(newnouns))
    else:
        newsynopsis.append(word)
print(" ".join(newsynopsis))