Commit f5c0f0ab by Paktalin

preprocessing with MAXLEN. Has to be rewritten

parent f334d8dd
import numpy as np import numpy as np
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.models import Sequential from keras.models import Sequential
from keras.layers import Bidirectional, Dense, Activation, LSTM, Dropout from keras.layers import Bidirectional, Dense, Activation, LSTM, Dropout
import pickle from preprocessing import read_sequences, read_next_words, SEQUENCE_LEN
# load the input array
sentences = np.genfromtxt('encoded_forms.csv', delimiter='~')
# set sequence length and step for sentences splitting
SEQUENCE_LEN = 3
STEP = 1
forms = 114 forms = 114
batch_size = 128 batch_size = 128
# create empty lists
sequences = []
next_words = []
# set sequences and next_words (x, y) # read sequences and next words from files
for i in tqdm(range(len(sentences))): sequences = read_sequences()
sentence = sentences[i] next_words = read_next_words()
# loop over each sentence splitting it into sequences
for j in range(0, len(sentence) - SEQUENCE_LEN, STEP):
# split the sentences into sequences of SEQUENCE_LEN
sequences.append(sentence[j: j + SEQUENCE_LEN])
# set next words for the current sequence
next_words.append(sentence[j + SEQUENCE_LEN])
#save the lists
with open('sequences', 'wb') as fp:
pickle.dump(sequences, fp)
with open('next_words', 'wb') as fp:
pickle.dump(next_words, fp)
# split training and test sets # split training and test sets
print('Splitting test and training sets...')
x_train, x_test, y_train, y_test = train_test_split(sequences, next_words, test_size=0.33) x_train, x_test, y_train, y_test = train_test_split(sequences, next_words, test_size=0.33)
x_train, x_test = np.array(x_train), np.array(x_test)
print(x_train[0])
print('Defining the model...')
dropout = 0.2 dropout = 0.2
model = Sequential() model = Sequential()
model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, forms))) model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, forms)))
...@@ -43,6 +27,9 @@ if dropout > 0: ...@@ -43,6 +27,9 @@ if dropout > 0:
model.add(Dense(forms)) model.add(Dense(forms))
model.add(Activation('softmax')) model.add(Activation('softmax'))
print('Compiling the model...')
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=15, validation_data=(x_test, y_test)) print('Fitting the data...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=15)
print('Saving the model...')
model.save('lstm.h5') model.save('lstm.h5')
\ No newline at end of file
File added
...@@ -2,28 +2,71 @@ from estnltk import Text ...@@ -2,28 +2,71 @@ from estnltk import Text
import numpy as np import numpy as np
from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.text import text_to_word_sequence
from tqdm import tqdm from tqdm import tqdm
import pickle
# the maximum length of a sentence (number of words kept per sentence)
MAXLEN = 70
# set sequence length and step for sentences splitting
SEQUENCE_LEN = 3
STEP = 1

# names of the files read and written by the preprocessing functions
articles_file = 'articles.txt'
encoded_forms_file = 'encoded_forms.csv'
next_words_file = 'next_words'
sequences_file = 'sequences'
# loop over all sentences showing a loading bar
def encode_forms():
    """Encode the words of every sentence as integer form codes.

    Reads ``articles_file``, splits it into sentences, and replaces each of
    the first ``MAXLEN`` words of a sentence with an integer code for the
    first entry of that word's ``forms``. Codes are assigned in order of
    first appearance, starting at 1; positions past the end of a sentence
    keep the initial value 0. The resulting integer matrix of shape
    ``(number of sentences, MAXLEN)`` is written to ``encoded_forms_file``
    with ``~`` as the delimiter.
    """
    # load data; the context manager closes the file once it has been read
    with open(articles_file, encoding='utf-8') as fp:
        articles = Text(fp.read())
    # transform to an array of sentences
    sentences = articles.sentence_texts
    # create an empty dict to store forms like {form: code}
    dict_forms = {}
    # initialize a numpy array prefilled with zeros
    encoded_forms = np.zeros((len(sentences), MAXLEN), dtype=int)
    # loop over all sentences showing a loading bar
    for i, sentence in enumerate(tqdm(sentences)):
        # split the sentence into a list of lowercase words
        words = text_to_word_sequence(sentence)
        # loop over the words in the current sentence, ignoring any past MAXLEN
        for j, word in enumerate(words[:MAXLEN]):
            form = Text(word).forms[0]
            # add the unseen form to the dictionary with the next free code
            if form not in dict_forms:
                dict_forms[form] = len(dict_forms) + 1
            # store the form's code at the word's position
            encoded_forms[i, j] = dict_forms[form]
    np.savetxt(encoded_forms_file, encoded_forms, delimiter="~", fmt='%i')
\ No newline at end of file
def set_sequences_and_new_words():
    """Split the encoded sentences into sequences and their next words.

    Loads the matrix stored in ``encoded_forms_file`` and slides a window of
    ``SEQUENCE_LEN`` codes over every row, advancing by ``STEP``. Each window
    is appended to ``sequences`` and the code that directly follows it to
    ``next_words``. Both lists are then pickled to ``sequences_file`` and
    ``next_words_file``.
    """
    # create empty lists
    sequences = []
    next_words = []
    # load the input array of encoded forms. The file is written with
    # fmt='%i', so read it back as integers instead of genfromtxt's default
    # floats; atleast_2d keeps a single-row file iterable row by row.
    sentences = np.atleast_2d(
        np.genfromtxt(encoded_forms_file, delimiter='~', dtype=int))
    # NOTE(review): encode_forms leaves unused positions at 0, so windows
    # that lie entirely in that padding are emitted as well - confirm that
    # this is intended.
    for sentence in tqdm(sentences):
        # loop over each sentence splitting it into sequences
        for j in range(0, len(sentence) - SEQUENCE_LEN, STEP):
            # split the sentences into sequences of SEQUENCE_LEN
            sequences.append(sentence[j: j + SEQUENCE_LEN])
            # set next words for the current sequence
            next_words.append(sentence[j + SEQUENCE_LEN])
    # save the lists
    print('Saving sequences...')
    with open(sequences_file, 'wb') as fp:
        pickle.dump(sequences, fp)
    print('Saving next_words...')
    with open(next_words_file, 'wb') as fp:
        pickle.dump(next_words, fp)
def read_sequences(path=None):
    """Load the pickled list of sequences.

    Args:
        path: file to read; defaults to ``sequences_file`` when omitted.

    Returns:
        The object unpickled from the file.
    """
    if path is None:
        path = sequences_file
    # NOTE: pickle.load can execute arbitrary code while loading; only read
    # files that this project wrote itself.
    with open(path, 'rb') as fp:
        return pickle.load(fp)
def read_next_words(path=None):
    """Load the pickled list of next words.

    Args:
        path: file to read; defaults to ``next_words_file`` when omitted.

    Returns:
        The object unpickled from the file.
    """
    if path is None:
        path = next_words_file
    # NOTE: pickle.load can execute arbitrary code while loading; only read
    # files that this project wrote itself.
    with open(path, 'rb') as fp:
        return pickle.load(fp)
\ No newline at end of file
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment