Commit 3c6d5faf by Paktalin

K means works

parent f33b21f6
...@@ -4,6 +4,7 @@ from k_means import plot_k_means ...@@ -4,6 +4,7 @@ from k_means import plot_k_means
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
def map_verbs_with_sentences(): def map_verbs_with_sentences():
verbs = {} verbs = {}
...@@ -56,6 +57,7 @@ def add_value_to_dict(value, dictionary, distance): ...@@ -56,6 +57,7 @@ def add_value_to_dict(value, dictionary, distance):
dictionary[value] = 0 dictionary[value] = 0
dictionary[value] += 1 / distance dictionary[value] += 1 / distance
def construct_df_of_verbs(initial_df): def construct_df_of_verbs(initial_df):
verbs = load_dict('verbs_dict') verbs = load_dict('verbs_dict')
rows = [] rows = []
...@@ -79,6 +81,7 @@ def construct_df_of_verbs(initial_df): ...@@ -79,6 +81,7 @@ def construct_df_of_verbs(initial_df):
def transform_df_to_preprocessed_array(df): # divide by the number of samples def transform_df_to_preprocessed_array(df): # divide by the number of samples
X = df.drop(['verb', 'number_of_samples'], axis=1) X = df.drop(['verb', 'number_of_samples'], axis=1)
remove_unpopular_features(X)
columns = X.columns columns = X.columns
X = X.values X = X.values
number_of_samples = df['number_of_samples'].values number_of_samples = df['number_of_samples'].values
...@@ -87,11 +90,30 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples ...@@ -87,11 +90,30 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples
X = X / number_of_samples X = X / number_of_samples
return X, columns return X, columns
def remove_unpopular_features(df):
    """Drop the rarely populated feature columns from ``df`` in place.

    The previous version assigned the result of ``df.drop(...)`` to a local
    name and returned nothing, so the caller's frame was never changed.
    Dropping in place makes a call site that ignores the return value
    actually lose the columns.

    Args:
        df: pandas DataFrame with one column per feature. It must contain
            every column listed in ``unpopular`` as well as ``'nuks'``;
            a missing column raises ``KeyError``.

    Returns:
        None. ``df`` itself is modified.
    """
    unpopular = ['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat']
    df.drop(unpopular, axis=1, inplace=True)

    # Diagnostic output kept from the original: print the non-zero entries of
    # the 'nuks' column.
    # NOTE(review): earlier commented-out prints inspected these columns the
    # same way: 'ksite', 'neg ge', 'nud', 'nuksin', 'tav', 'tud', 'v',
    # 'Unnamed: 84', 'neg gem', 'n|sin', 'tavat|vat', 'tama', 'me|sime',
    # 'tav|v' -- presumably further removal candidates; confirm before dropping.
    print(df[df['nuks'] != 0]['nuks'])
df = read_csv('verbs.csv', sep='~', header=0) df = read_csv('verbs.csv', sep='~', header=0)
X, columns = transform_df_to_preprocessed_array(df) X, columns = transform_df_to_preprocessed_array(df)
K = 5 # K = 5
plot_k_means(X, K, columns) # plot_k_means(X, K, columns)
# df = read_csv('cleaned_dataframe.csv', sep='~') # df = read_csv('cleaned_dataframe.csv', sep='~')
# df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form'] # df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form']
# construct_df_of_verbs(df) # construct_df_of_verbs(df)
\ No newline at end of file
...@@ -14,29 +14,34 @@ def cost(X, R, M): ...@@ -14,29 +14,34 @@ def cost(X, R, M):
cost += (R[:,k] * sq_distances).sum() cost += (R[:,k] * sq_distances).sum()
return cost return cost
def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True): def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
N, D = X.shape N, D = X.shape
M = np.zeros((K, D)) # means M = np.zeros((K, D)) # means
exponents = np.empty((N, K)) R = np.zeros((N, K))
for k in range(K): for k in range(K):
M[k] = X[np.random.choice(N)] M[k] = X[np.random.choice(N)]
costs = np.zeros(max_iter) costs = np.zeros(max_iter)
for i in range(max_iter): for i in range(max_iter):
for k in range(K):
for n in range(N):
exponents[n,k] = np.exp(-beta*d(M[k], X[n]))
R = exponents / exponents.sum(axis=1, keepdims=True) for n in range(N):
min_distance = d(X[n], M[0])
min_k = 0
for k in range(K):
if d(X[n], M[k]) < min_distance:
min_distance = d(X[n], M[k])
min_k = k
R[n,:] = 0
R[n,min_k] = 1
for k in range(K): for k in range(K):
M[k] = R[:,k].dot(X) / R[:,k].sum() M[k] = R[:,k].dot(X) / R[:,k].sum()
costs[i] = cost(X, R, M) costs[i] = cost(X, R, M)
# if i > 0: if i > 0:
# if np.abs(costs[i] - costs[i-1]) < 1e-5: if np.abs(costs[i] - costs[i-1]) < 1e-5:
# break break
if show_plots: if show_plots:
plt.plot(costs) plt.plot(costs)
...@@ -47,7 +52,7 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True): ...@@ -47,7 +52,7 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
colors = R.dot(random_colors) colors = R.dot(random_colors)
for i in range(X.shape[0]-1): for i in range(X.shape[0]-1):
for j in range(i + 1, X.shape[0]-1): for j in range(i + 1, X.shape[0]-1):
plt.scatter(X[:,i], X[:,j], c=colors) plt.scatter(X[:,i], X[:,j], c=colors, s=7, alpha=0.9)
plt.xlabel(columns[i]) plt.xlabel(columns[i])
plt.ylabel(columns[j]) plt.ylabel(columns[j])
plt.show() plt.show()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment