Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
likorn
/
estonian-lstm
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
355e205f
authored
Jan 12, 2019
by
Paktalin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Current version of the project
parent
c282d622
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
30 additions
and
12 deletions
main.py
preprocessing.py
main.py
View file @
355e205f
...
@@ -5,10 +5,10 @@ from util import read_list, read_array
...
@@ -5,10 +5,10 @@ from util import read_list, read_array
from
keras.utils
import
to_categorical
from
keras.utils
import
to_categorical
import
numpy
as
np
import
numpy
as
np
VOCAB_SIZE
=
79
VOCAB_SIZE
=
85
def
get_train_test_val
():
def
get_train_test_val
():
sequences
=
read_array
(
'sequences_
na
.csv'
)
sequences
=
read_array
(
'sequences_
splitted
.csv'
)
X
,
y
=
sequences
[:,:
-
1
],
sequences
[:,
-
1
]
X
,
y
=
sequences
[:,:
-
1
],
sequences
[:,
-
1
]
y
=
to_categorical
(
y
,
num_classes
=
VOCAB_SIZE
)
y
=
to_categorical
(
y
,
num_classes
=
VOCAB_SIZE
)
x_train
,
x_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.2
)
x_train
,
x_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.2
)
...
@@ -55,6 +55,7 @@ def train_the_last_word_model():
...
@@ -55,6 +55,7 @@ def train_the_last_word_model():
x_train
,
y_train
,
x_test
,
y_test
,
x_validate
,
y_validate
=
get_train_test_val
()
x_train
,
y_train
,
x_test
,
y_test
,
x_validate
,
y_validate
=
get_train_test_val
()
while
(
True
):
while
(
True
):
model
=
load_model
(
model_name
(
epoch
))
model
=
load_model
(
model_name
(
epoch
))
print
(
model
.
summary
())
train_history
=
model
.
fit
(
x_train
,
y_train
,
epochs
=
1
,
batch_size
=
2048
,
validation_data
=
(
x_validate
,
y_validate
))
train_history
=
model
.
fit
(
x_train
,
y_train
,
epochs
=
1
,
batch_size
=
2048
,
validation_data
=
(
x_validate
,
y_validate
))
val_loss
=
train_history
.
history
[
'val_loss'
][
-
1
]
val_loss
=
train_history
.
history
[
'val_loss'
][
-
1
]
if
val_loss
<
previous_loss
:
if
val_loss
<
previous_loss
:
...
...
preprocessing.py
View file @
355e205f
...
@@ -6,6 +6,7 @@ import pickle, re
...
@@ -6,6 +6,7 @@ import pickle, re
from
util
import
save_list
,
read_list
,
save_array
,
read_array
from
util
import
save_list
,
read_list
,
save_array
,
read_array
from
keras.preprocessing.sequence
import
pad_sequences
from
keras.preprocessing.sequence
import
pad_sequences
from
keras.utils
import
to_categorical
from
keras.utils
import
to_categorical
import
matplotlib.pyplot
as
plt
# set sequence length and step for sentences splitting
# set sequence length and step for sentences splitting
SEQUENCE_LEN
=
3
SEQUENCE_LEN
=
3
...
@@ -97,15 +98,14 @@ def save_forms_and_sequences():
...
@@ -97,15 +98,14 @@ def save_forms_and_sequences():
sentence
=
text_to_word_sequence
(
sentences
[
i
])
sentence
=
text_to_word_sequence
(
sentences
[
i
])
for
word
in
sentence
:
for
word
in
sentence
:
form
=
Text
(
word
)
.
forms
[
0
]
form
=
Text
(
word
)
.
forms
[
0
]
if
'|'
in
form
or
'?'
in
form
:
forms_string
=
'ambiguous'
break
if
form
==
''
:
if
form
==
''
:
form
=
' '
form
=
' '
forms_string
=
forms_string
+
'~'
+
form
if
'|'
in
form
or
'?'
in
form
:
if
forms_string
!=
'ambiguous'
:
forms
.
append
(
forms_string
)
forms
.
append
(
forms_string
)
forms_string
=
''
else
:
forms_string
=
forms_string
+
'~'
+
form
forms
.
append
(
forms_string
)
save_list
(
forms
,
forms_file
)
save_list
(
forms
,
forms_file
)
# tokenize the forms
# tokenize the forms
...
@@ -117,6 +117,23 @@ def save_forms_and_sequences():
...
@@ -117,6 +117,23 @@ def save_forms_and_sequences():
sequences
=
np
.
array
(
sequences
)
sequences
=
np
.
array
(
sequences
)
save_array
(
sequences
,
'sequences_na.csv'
)
# not ambiguous
save_array
(
sequences
,
'sequences_na.csv'
)
# not ambiguous
save_forms_and_sequences
()
forms
=
read_list
(
forms_file
)
tokenizer
=
Tokenizer
(
split
=
'~'
,
filters
=
''
)
tokenizer
.
fit_on_texts
(
forms
)
sequences
=
tokenizer
.
texts_to_sequences
(
forms
)
minlen
=
3
lengths
=
[]
for
sequence
in
sequences
:
if
len
(
sequence
)
<
3
:
sequences
.
remove
(
sequence
)
else
:
lengths
.
append
(
len
(
sequence
))
plt
.
hist
(
lengths
,
bins
=
100
)
plt
.
show
()
# print(read_list(forms_file))
sequences
=
pad_sequences
(
sequences
,
40
)
\ No newline at end of file
sequences
=
np
.
array
(
sequences
)
save_array
(
sequences
,
'sequences_splitted.csv'
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment