Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
likorn
/
estonian_verbs
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
3c6d5faf
authored
Dec 15, 2018
by
Paktalin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
K means works
parent
f33b21f6
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
11 deletions
__pycache__/k_means.cpython-36.pyc
estnltk_preprocessing.py
k_means.py
__pycache__/k_means.cpython-36.pyc
View file @
3c6d5faf
No preview for this file type
estnltk_preprocessing.py
View file @
3c6d5faf
...
...
@@ -4,6 +4,7 @@ from k_means import plot_k_means
import
pandas
as
pd
import
matplotlib.pyplot
as
plt
import
numpy
as
np
np
.
set_printoptions
(
formatter
=
{
'float'
:
lambda
x
:
"{0:0.3f}"
.
format
(
x
)})
def
map_verbs_with_sentences
():
verbs
=
{}
...
...
@@ -56,6 +57,7 @@ def add_value_to_dict(value, dictionary, distance):
dictionary
[
value
]
=
0
dictionary
[
value
]
+=
1
/
distance
def
construct_df_of_verbs
(
initial_df
):
verbs
=
load_dict
(
'verbs_dict'
)
rows
=
[]
...
...
@@ -79,6 +81,7 @@ def construct_df_of_verbs(initial_df):
def
transform_df_to_preprocessed_array
(
df
):
# divide by the number of samples
X
=
df
.
drop
([
'verb'
,
'number_of_samples'
],
axis
=
1
)
remove_unpopular_features
(
X
)
columns
=
X
.
columns
X
=
X
.
values
number_of_samples
=
df
[
'number_of_samples'
]
.
values
...
...
@@ -87,11 +90,30 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples
X
=
X
/
number_of_samples
return
X
,
columns
def
remove_unpopular_features
(
df
):
df
=
df
.
drop
([
'b|vad'
,
'gu'
,
'neg ks'
,
'neg me'
,
'neg nud'
,
'neg o'
,
'neg vat'
],
axis
=
1
)
# print(df[df['ksite'] != 0]['ksite'])
# print(df[df['neg ge'] != 0]['neg ge'])
# print(df[df['nud'] != 0]['nud'])
print
(
df
[
df
[
'nuks'
]
!=
0
][
'nuks'
])
# print(df['nuksin'])
# print(df['tav'])
# print(df['tud'])
# print(df['v'])
# print(df['Unnamed: 84'])
# print(df['neg gem'])
# print(df['n|sin'])
# print(df['tavat|vat'])
# print(df['tama'])
# print(df['me|sime'])
# print(df['tav|v'])
df
=
read_csv
(
'verbs.csv'
,
sep
=
'~'
,
header
=
0
)
X
,
columns
=
transform_df_to_preprocessed_array
(
df
)
K
=
5
plot_k_means
(
X
,
K
,
columns
)
# K = 5
# plot_k_means(X, K, columns)
# df = read_csv('cleaned_dataframe.csv', sep='~')
# df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form']
# construct_df_of_verbs(df)
\ No newline at end of file
k_means.py
View file @
3c6d5faf
...
...
@@ -14,29 +14,34 @@ def cost(X, R, M):
cost
+=
(
R
[:,
k
]
*
sq_distances
)
.
sum
()
return
cost
def
plot_k_means
(
X
,
K
,
columns
,
max_iter
=
20
,
beta
=
1.0
,
show_plots
=
True
):
N
,
D
=
X
.
shape
M
=
np
.
zeros
((
K
,
D
))
# means
exponents
=
np
.
empty
((
N
,
K
))
R
=
np
.
zeros
((
N
,
K
))
for
k
in
range
(
K
):
M
[
k
]
=
X
[
np
.
random
.
choice
(
N
)]
costs
=
np
.
zeros
(
max_iter
)
for
i
in
range
(
max_iter
):
for
k
in
range
(
K
):
for
n
in
range
(
N
):
exponents
[
n
,
k
]
=
np
.
exp
(
-
beta
*
d
(
M
[
k
],
X
[
n
]))
min_distance
=
d
(
X
[
n
],
M
[
0
])
min_k
=
0
for
k
in
range
(
K
):
if
d
(
X
[
n
],
M
[
k
])
<
min_distance
:
min_distance
=
d
(
X
[
n
],
M
[
k
])
min_k
=
k
R
[
n
,:]
=
0
R
[
n
,
min_k
]
=
1
R
=
exponents
/
exponents
.
sum
(
axis
=
1
,
keepdims
=
True
)
for
k
in
range
(
K
):
M
[
k
]
=
R
[:,
k
]
.
dot
(
X
)
/
R
[:,
k
]
.
sum
()
costs
[
i
]
=
cost
(
X
,
R
,
M
)
#
if i > 0:
#
if np.abs(costs[i] - costs[i-1]) < 1e-5:
#
break
if
i
>
0
:
if
np
.
abs
(
costs
[
i
]
-
costs
[
i
-
1
])
<
1e-5
:
break
if
show_plots
:
plt
.
plot
(
costs
)
...
...
@@ -47,7 +52,7 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
colors
=
R
.
dot
(
random_colors
)
for
i
in
range
(
X
.
shape
[
0
]
-
1
):
for
j
in
range
(
i
+
1
,
X
.
shape
[
0
]
-
1
):
plt
.
scatter
(
X
[:,
i
],
X
[:,
j
],
c
=
colors
)
plt
.
scatter
(
X
[:,
i
],
X
[:,
j
],
c
=
colors
,
s
=
7
,
alpha
=
0.9
)
plt
.
xlabel
(
columns
[
i
])
plt
.
ylabel
(
columns
[
j
])
plt
.
show
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment