Current version of the project

355e205f · Paktalin · c282d622 · 355e205f · 355e205f
Commit 355e205f authored Jan 12, 2019 by Paktalin
Showing with 30 additions and 12 deletions
main.py
preprocessing.py
--- a/main.py
+++ b/main.py
@@ -5,10 +5,10 @@ from util import read_list, read_array
 from keras.utils import to_categorical
 import numpy as np
-VOCAB_SIZE = 79
+VOCAB_SIZE = 85
 def get_train_test_val():
-	sequences = read_array('sequences_na.csv')
+	sequences = read_array('sequences_splitted.csv')
 	X, y = sequences[:,:-1], sequences[:,-1]
 	y = to_categorical(y, num_classes=VOCAB_SIZE)
 	x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
@@ -55,6 +55,7 @@ def train_the_last_word_model():
 	x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
 	while(True):
 		model = load_model(model_name(epoch))
+		print(model.summary())
 		train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate))
 		val_loss = train_history.history['val_loss'][-1]
 		if val_loss < previous_loss:

--- a/preprocessing.py
+++ b/preprocessing.py
@@ -6,6 +6,7 @@ import pickle, re
 from util import save_list, read_list, save_array, read_array
 from keras.preprocessing.sequence import pad_sequences
 from keras.utils import to_categorical
+import matplotlib.pyplot as plt
 # set sequence length and step for sentences splitting
 SEQUENCE_LEN = 3
@@ -97,15 +98,14 @@ def save_forms_and_sequences():
 		sentence = text_to_word_sequence(sentences[i])
 		for word in sentence:
 			form = Text(word).forms[0]
-			if '|' in form or '?' in form:
-				forms_string = 'ambiguous'
-				break
 			if form == '':
 				form = ' '
-			forms_string = forms_string + '~' + form
+			if '|' in form or '?' in form:
-		if forms_string != 'ambiguous':
+				forms.append(forms_string)
-			forms.append(forms_string)
+				forms_string = ''
+			else:
+				forms_string = forms_string + '~' + form
+		forms.append(forms_string)
 	save_list(forms, forms_file)
 	# tokenize the forms
@@ -117,6 +117,23 @@ def save_forms_and_sequences():
 	sequences = np.array(sequences)
 	save_array(sequences, 'sequences_na.csv') # not ambiguous
-save_forms_and_sequences()
+forms = read_list(forms_file)
+tokenizer = Tokenizer(split='~', filters='')
+tokenizer.fit_on_texts(forms)
+sequences = tokenizer.texts_to_sequences(forms)
+minlen = 3
+lengths = []
+for sequence in sequences:
+	if len(sequence) < 3:
+		sequences.remove(sequence)
+	else:
+		lengths.append(len(sequence))
+plt.hist(lengths, bins=100)
+plt.show()
-# print(read_list(forms_file))
+sequences = pad_sequences(sequences, 40)
\ No newline at end of file
+sequences = np.array(sequences)
+save_array(sequences, 'sequences_splitted.csv')
\ No newline at end of file