Commit 355e205f by Paktalin

Current version of the project

parent c282d622
Showing with 30 additions and 12 deletions
...@@ -5,10 +5,10 @@ from util import read_list, read_array ...@@ -5,10 +5,10 @@ from util import read_list, read_array
from keras.utils import to_categorical from keras.utils import to_categorical
import numpy as np import numpy as np
VOCAB_SIZE = 79 VOCAB_SIZE = 85
def get_train_test_val(): def get_train_test_val():
sequences = read_array('sequences_na.csv') sequences = read_array('sequences_splitted.csv')
X, y = sequences[:,:-1], sequences[:,-1] X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=VOCAB_SIZE) y = to_categorical(y, num_classes=VOCAB_SIZE)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2) x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
...@@ -55,6 +55,7 @@ def train_the_last_word_model(): ...@@ -55,6 +55,7 @@ def train_the_last_word_model():
x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val() x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
while(True): while(True):
model = load_model(model_name(epoch)) model = load_model(model_name(epoch))
print(model.summary())
train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate)) train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate))
val_loss = train_history.history['val_loss'][-1] val_loss = train_history.history['val_loss'][-1]
if val_loss < previous_loss: if val_loss < previous_loss:
......
...@@ -6,6 +6,7 @@ import pickle, re ...@@ -6,6 +6,7 @@ import pickle, re
from util import save_list, read_list, save_array, read_array from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical from keras.utils import to_categorical
import matplotlib.pyplot as plt
# set sequence length and step for sentences splitting # set sequence length and step for sentences splitting
SEQUENCE_LEN = 3 SEQUENCE_LEN = 3
...@@ -97,15 +98,14 @@ def save_forms_and_sequences(): ...@@ -97,15 +98,14 @@ def save_forms_and_sequences():
sentence = text_to_word_sequence(sentences[i]) sentence = text_to_word_sequence(sentences[i])
for word in sentence: for word in sentence:
form = Text(word).forms[0] form = Text(word).forms[0]
if '|' in form or '?' in form:
forms_string = 'ambiguous'
break
if form == '': if form == '':
form = ' ' form = ' '
forms_string = forms_string + '~' + form if '|' in form or '?' in form:
if forms_string != 'ambiguous': forms.append(forms_string)
forms.append(forms_string) forms_string = ''
else:
forms_string = forms_string + '~' + form
forms.append(forms_string)
save_list(forms, forms_file) save_list(forms, forms_file)
# tokenize the forms # tokenize the forms
...@@ -117,6 +117,23 @@ def save_forms_and_sequences(): ...@@ -117,6 +117,23 @@ def save_forms_and_sequences():
sequences = np.array(sequences) sequences = np.array(sequences)
save_array(sequences, 'sequences_na.csv') # not ambiguous save_array(sequences, 'sequences_na.csv') # not ambiguous
save_forms_and_sequences() forms = read_list(forms_file)
# Tokenize the saved forms. Each entry joins its forms with '~', so that is
# the split character; filters='' asks the tokenizer not to strip any
# characters from the forms.
tokenizer = Tokenizer(split='~', filters='')
tokenizer.fit_on_texts(forms)
sequences = tokenizer.texts_to_sequences(forms)

# Keep only sequences with at least `minlen` tokens. A new list is built
# instead of calling sequences.remove() inside a loop over `sequences`:
# removing while iterating makes the iterator skip the element that follows
# each removal, so some short sequences were kept and some lengths were never
# recorded.
minlen = 3
sequences = [sequence for sequence in sequences if len(sequence) >= minlen]
lengths = [len(sequence) for sequence in sequences]

# Plot the length distribution of the sequences that were kept.
plt.hist(lengths, bins=100)
plt.show()
# print(read_list(forms_file)) sequences = pad_sequences(sequences, 40)
\ No newline at end of file sequences = np.array(sequences)
save_array(sequences, 'sequences_splitted.csv')
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment