Commit c1ae90b8 by Paktalin

Finished plotting the clusters

parent b0ad7153
5.208333333333333044e-03~0.000000000000000000e+00~2.314814814814814686e-03~2.120580808080807858e-02~4.734848484848484980e-04~1.046187608687608635e-02~5.555555555555555768e-03~9.801136363636364299e-03~0.000000000000000000e+00~5.208333333333333044e-03~1.504629629629629546e-02~6.330787547903561630e-02~3.241840027256693291e-01~0.000000000000000000e+00~3.026294692961359421e-03
4.861804861804861801e-03~0.000000000000000000e+00~0.000000000000000000e+00~1.771341103713394705e-01~4.133161811733240300e-03~5.162661412661412666e-03~0.000000000000000000e+00~3.276554560158456003e-02~0.000000000000000000e+00~6.125818625818625976e-03~1.268338143338143369e-02~1.028388403665014272e-01~1.420068651886833427e-02~2.777777777777777450e-03~1.544011544011543967e-02
4.636903084795872959e-03~1.941169073906091934e-04~9.857873362462635930e-04~1.299108893209762958e-02~1.036641033894293573e-02~8.588813314866432236e-03~1.216400170279219838e-03~4.262528244201521654e-02~3.246485091503856918e-03~1.056740879963464011e-02~1.285236175385507870e-02~5.401369830644040665e-02~3.434420776918679225e-02~9.373483898537837613e-04~8.364202361179387882e-03
0.000000000000000000e+00~0.000000000000000000e+00~1.190476190476190410e-02~9.033956719942878855e-03~5.061115355233002447e-04~1.046493107046740298e-02~4.084967320261437490e-04~4.515398048702546491e-02~1.062091503267973899e-02~5.065359477124183225e-03~1.961102622867328879e-02~3.730394994414171217e-01~1.251335130011600738e-02~0.000000000000000000e+00~2.438475390156062250e-03
2.730978038355087560e-03~1.707650273224043714e-04~6.094997898276586796e-04~9.717531842409924542e-03~1.373039682862527838e-02~5.417899183569385713e-03~2.980625931445604116e-04~1.478773417284820202e-01~1.124121779859484777e-03~6.603083727386665699e-03~1.533876569895004441e-03~1.607589300003589572e-01~3.294602401875183079e-02~2.602133749674733279e-04~8.268645222941524431e-03
4.917822640793951400e-03~1.577441084538662214e-04~1.312211165005809773e-03~1.323998689082658137e-02~9.398589134932537961e-03~1.474209019772963584e-02~7.954321443962262010e-04~4.538150067009068950e-02~4.450182553867644900e-03~1.114021151290270524e-02~8.137564144834337140e-03~1.287223151732692639e-01~2.333622166156121280e-02~9.602919338217811305e-04~6.301141685510363438e-03
5.669793169793169337e-03~0.000000000000000000e+00~2.104377104377104285e-04~1.085487624076743839e-02~6.363984172435376355e-03~4.688904111607076694e-03~2.164502164502164771e-04~3.317582809790099035e-02~8.166076347894530001e-04~1.424768037276413768e-01~5.451024886185313126e-03~8.451996708916544032e-02~4.316976201950852021e-02~0.000000000000000000e+00~1.976969238874000737e-03
2.321348485476769450e-03~9.648018231352957312e-05~7.306515072693670499e-04~1.131641502560471260e-02~6.221400854762891737e-03~7.924369286866983844e-03~1.008742830798851128e-03~4.658146137665673814e-02~1.619239587783604523e-03~1.098825656683653784e-02~7.318153714929054310e-03~7.102865985186213038e-02~1.108517979889251104e-01~1.658972408167568090e-04~4.932008529592712444e-03
3.545960572585004928e-03~9.929347539143124394e-05~1.752318626873985985e-03~1.472835500332144582e-02~1.075351308556328142e-02~6.877328323472120877e-03~9.429920033525271590e-04~1.018952421614638532e-01~2.023353785675198815e-03~1.268872841340197913e-02~1.099887390451496222e-02~6.510177347112848512e-02~3.307445156632749655e-02~7.470808531358101622e-04~6.543469901321514494e-03
3.787878787878787550e-03~0.000000000000000000e+00~0.000000000000000000e+00~4.614325068870522850e-03~2.272727272727273051e-03~0.000000000000000000e+00~0.000000000000000000e+00~3.037680273069883752e-01~0.000000000000000000e+00~9.449855699855699495e-03~1.574183392365210640e-02~8.207924756220211226e-02~2.646727010363374155e-02~6.313131313131312584e-04~3.968253968253968034e-03
This diff is collapsed. Click to expand it.
def transform_df_to_preprocessed_array(df):
    """Normalise the per-verb counts by each verb's number of samples.

    The verb-form and part-of-speech columns are dropped first. Every
    remaining column except 'number_of_samples' and 'verb' is then divided
    row-wise by 'number_of_samples'.

    Returns:
        A pair ``(df, df_numeric)``: ``df`` is the trimmed frame whose count
        columns have been overwritten with the ratios (it still carries
        'verb' and 'number_of_samples'); ``df_numeric`` holds only the ratio
        columns.
    """
    trimmed = drop_parts_of_speech(drop_verb_forms(df))
    sample_counts = trimmed['number_of_samples']
    ratios = trimmed.drop(['number_of_samples', 'verb'], axis=1)
    ratios = ratios.div(sample_counts, axis=0)
    # Write the ratios back so the full frame and the numeric view agree.
    trimmed[ratios.columns] = ratios.values
    return trimmed, ratios
def drop_rare_features(df):
    """Return a copy of ``df`` without the columns listed as rare features.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    rare_columns = [
        'b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat',
        'nuksin', 'tav', 'tud', 'neg gem', 'n|sin', 'tavat|vat', 'tama',
        'me|sime', 'tav|v', 'ksite', 'neg ge', 'nud', 'nuks', 'v',
    ]
    return df.drop(columns=rare_columns)
def drop_verb_forms(df):
    """Return a copy of ``df`` without the verb-form columns.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    verb_form_columns = [
        'b', 'd', 'da', 'des', 'ks', 'ksid', 'ma', 'me', 's', 'sid', 'ta',
        'vad', 'b|vad', 'ge', 'gem', 'gu', 'ksime', 'ksin', 'ksite', 'maks',
        'mas', 'mast', 'mata', 'n', 'neg ge', 'neg ks', 'neg me', 'neg nud',
        'neg o', 'neg vat', 'nud', 'nuks', 'nuksin', 'o', 'sime', 'sin',
        'site', 'taks', 'takse', 'tav', 'te', 'ti', 'tud', 'v', 'vat',
        'neg gem', 'n|sin', 'ma|tama', 'tavat|vat', 'tama', 'me|sime',
        'tav|v',
    ]
    return df.drop(columns=verb_form_columns)
def drop_parts_of_speech(df):
    """Return a copy of ``df`` without the part-of-speech tag columns.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    pos_columns = ['A', 'H', 'N', 'O', 'P', 'S', 'U', 'Y']
    return df.drop(columns=pos_columns)
\ No newline at end of file
from estnltk import Text
from util import save_dict, load_dict, save_csv, read_csv
from k_means import plot_k_means
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Print NumPy floats with three decimal places.
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
# Render pandas floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
def map_verbs_with_sentences():
verbs = {}
......@@ -84,40 +84,8 @@ def construct_df_of_verbs(initial_df):
save_csv(verbs_df, 'verbs.csv', sep='~', header=True)
print(verbs_df)
def transform_df_to_preprocessed_array(df):
    """Divide every retained feature by the verb's number of samples.

    Args:
        df: frame with a 'number_of_samples' column plus the columns that
            ``drop_redundant_features`` expects.

    Returns:
        A pair ``(X, columns)``: ``X`` is a 2-D array with one row per row
        of ``df`` and one column per retained feature, each value divided by
        that row's 'number_of_samples'; ``columns`` are the feature names.
    """
    features = drop_redundant_features(df)
    columns = features.columns
    number_of_samples = df['number_of_samples'].values
    # Broadcast the per-row divisor across the feature axis instead of
    # tiling it through a Python list; this also works when no feature
    # columns remain (the tiled version failed in swapaxes in that case).
    X = features.values / number_of_samples[:, np.newaxis]
    return X, columns
def drop_redundant_features(df):
    """Strip identifier, verb-form and part-of-speech columns from ``df``.

    Prints the names of the remaining columns and returns the reduced frame.
    """
    features = df.drop(columns=['verb', 'number_of_samples'])
    features = drop_parts_of_speech(drop_verb_forms(features))
    # remove rare features
    # df = df.drop(['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat', 'nuksin', 'tav', 'tud', 'neg gem', 'n|sin', 'tavat|vat', 'tama', 'me|sime', 'tav|v', 'ksite', 'neg ge', 'nud', 'nuks', 'v'], axis=1)
    print(features.columns)
    return features
def drop_verb_forms(df):
    """Return a copy of ``df`` with every verb-form column removed.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    verb_form_columns = (
        'b', 'd', 'da', 'des', 'ks', 'ksid', 'ma', 'me', 's', 'sid', 'ta',
        'vad', 'b|vad', 'ge', 'gem', 'gu', 'ksime', 'ksin', 'ksite', 'maks',
        'mas', 'mast', 'mata', 'n', 'neg ge', 'neg ks', 'neg me', 'neg nud',
        'neg o', 'neg vat', 'nud', 'nuks', 'nuksin', 'o', 'sime', 'sin',
        'site', 'taks', 'takse', 'tav', 'te', 'ti', 'tud', 'v', 'vat',
        'neg gem', 'n|sin', 'ma|tama', 'tavat|vat', 'tama', 'me|sime',
        'tav|v',
    )
    reduced = df.drop(columns=list(verb_form_columns))
    return reduced
def drop_parts_of_speech(df):
    """Return a copy of ``df`` with the part-of-speech tag columns removed.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    pos_columns = ('A', 'H', 'N', 'O', 'P', 'S', 'U', 'Y')
    reduced = df.drop(columns=list(pos_columns))
    return reduced
# Script entry: load the per-verb table, normalise it and cluster the result.
df = read_csv('verbs.csv', sep='~', header=0)
X, columns = transform_df_to_preprocessed_array(df)
# Number of clusters passed to k-means.
K = 5
plot_k_means(X, K, columns)
# Disabled: rebuild the per-verb table from the cleaned sentence-level data.
# df = read_csv('cleaned_dataframe.csv', sep='~')
# df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form']
# construct_df_of_verbs(df)
\ No newline at end of file
if __name__ == '__main__':
    # When run as a script, build the per-verb table from the cleaned
    # sentence-level data.
    column_names = [
        'distance', 'noun_like', 'noun_like_form', 'noun_like_pos',
        'sentence', 'verb', 'verbs_form',
    ]
    df = read_csv('cleaned_dataframe.csv', sep='~')
    df.columns = column_names
    construct_df_of_verbs(df)
\ No newline at end of file
from estnltk import Text, TextCleaner, ESTONIAN
import random
from util import load_dict, save_dict
def save_random_verbs():
    """Draw random verbs with one value each and pickle them for labelling.

    Loads the 'verbs_dict' mapping, makes 100 random draws of a key, and for
    each draw stores one randomly chosen element of that key's value. The
    same verb can be drawn more than once (a later draw overwrites an
    earlier one), so the result may hold fewer than 100 entries. The result
    is printed and saved under the name 'verbs_with_labels'.
    """
    # Use a separate local name: assigning to ``ESTONIAN`` inside the
    # function would make it a local variable, so reading it on the
    # right-hand side raised UnboundLocalError.
    alphabet = ESTONIAN + '«»„ˮ“€’.…'
    # NOTE(review): the cleaner is constructed but never applied - confirm
    # whether the sampled sentences were meant to be cleaned.
    text_cleaner = TextCleaner(alphabet)
    verbs = load_dict('verbs_dict')
    # Materialise the items once instead of on every draw.
    candidates = list(verbs.items())
    verbs_to_label = {}
    for _ in range(100):
        verb, sentences = random.choice(candidates)
        verbs_to_label[verb] = random.choice(sentences)
    print(verbs_to_label)
    save_dict(verbs_to_label, 'verbs_with_labels')
def load_random_verbs():
    """Print every saved verb with its sentence and the sentence's analysis.

    Reads the 'verbs_with_labels' mapping and, for each entry, prints the
    verb and sentence followed by the estnltk data frame of word texts,
    lemmas and part-of-speech tags for that sentence.
    """
    labelled = load_dict('verbs_with_labels')
    for verb, sentence in labelled.items():
        print("%s: '%s'" % (verb, sentence))
        analysis = Text(sentence).get.word_texts.lemmas.postags.as_dataframe
        print(analysis)
# Runs at module level: print the previously saved verbs and their analyses.
load_random_verbs()
\ No newline at end of file
import sys

import matplotlib.pyplot as plt
import numpy as np

from plot_util import plot_costs
# Always print arrays in full. NaN is rejected as a threshold by current
# NumPy releases; sys.maxsize is the documented way to disable summarising.
np.set_printoptions(threshold=sys.maxsize)
def d(u, v): # squared difference
......@@ -14,7 +14,7 @@ def cost(X, R, M):
cost += (R[:,k] * sq_distances).sum()
return cost
def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
def plot_k_means(X, K, forms, verbs, max_iter=20, beta=1.0):
N, D = X.shape
M = np.zeros((K, D)) # means
R = np.zeros((N, K))
......@@ -43,18 +43,5 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
if np.abs(costs[i] - costs[i-1]) < 1e-5:
break
if show_plots:
plt.plot(costs)
plt.title("Costs")
plt.show()
random_colors = np.random.random((K, 3))
colors = R.dot(random_colors)
for i in range(X.shape[1]-1):
for j in range(i + 1, X.shape[1]-1):
plt.scatter(X[:,i], X[:,j], c=colors, s=7, alpha=0.9)
plt.xlabel(columns[i])
plt.ylabel(columns[j])
plt.show()
plot_costs(costs)
return M, R
\ No newline at end of file
import pandas as pd
import numpy as np
from util import read_csv, sort_dict
from util import load_dict
from k_means import plot_k_means, d
from df_util import transform_df_to_preprocessed_array, drop_rare_features, drop_verb_forms, drop_parts_of_speech
from plot_util import plot_features, plot_form_pdf, hist_verbs, plot_costs, hist_forms, pie_chart_clusters, pie_chart_verb
# Do not wrap wide DataFrames across several lines when printing.
pd.set_option('display.expand_frame_repr', False)
# Number of clusters used by the k-means run and the per-cluster reports.
K = 10
def get_verbs_data():
    """Load 'verbs.csv' and return the normalised data in several views.

    Returns:
        ``(X, verbs, forms, df)`` where ``X`` is the numeric array of the
        normalised feature columns, ``verbs`` is the list from the 'verb'
        column, ``forms`` are the feature column names and ``df`` is the
        full preprocessed frame.
    """
    raw = read_csv('verbs.csv', sep='~', header=0)
    df, numeric = transform_df_to_preprocessed_array(raw)
    return numeric.values, df['verb'].tolist(), numeric.columns, df
def run_k_means():
    """Cluster the verb data with k-means and save the results to disk.

    Writes the assignment matrix ``R`` (cast to int) to 'R.csv' and the
    cluster means ``M`` to 'M.csv', both '~'-delimited.
    """
    # get_verbs_data() returns four values; the frame is not needed here.
    # Unpacking into three names raised "too many values to unpack".
    X, verbs, forms, _ = get_verbs_data()
    M, R = plot_k_means(X, K, forms, verbs)
    np.savetxt('R.csv', R.astype(int), fmt='%d', delimiter='~')
    np.savetxt('M.csv', M, delimiter='~')
def plot_results(X, R, forms, df):
    """Show all overview plots, then print the verbs of every cluster.

    Args:
        X: numeric feature array, one row per verb.
        R: assignment matrix; column ``k`` equals 1 for members of cluster k.
        forms: feature names matching the columns of ``X``.
        df: frame with a 'verb' column aligned with the rows of ``R``.
    """
    plot_form_pdf(X, forms)
    hist_forms(X, forms)
    plot_features(K, R, X, forms)
    pie_chart_clusters(X, R, K, forms)
    for cluster_id in range(K):
        in_cluster = R[:, cluster_id] == 1
        print('Cluster', cluster_id)
        print(df[in_cluster]['verb'])
def print_verb_info(verb, df, X, R, forms):
    """Report on one verb: cluster neighbours, form pie chart, sample usages.

    The verb is looked up in the 'verb' column of ``df``; the first matching
    row supplies the feature vector for the pie chart.
    """
    matching_rows = df[df['verb'] == verb]
    row_index = matching_rows.index
    print_cluster(row_index, df, X, R)
    verb_profile = X[row_index][0].tolist()
    pie_chart_verb(verb_profile, forms.tolist(), verb)
    print_sample_usages(verb)
def print_cluster(current_verb_index, df, X, R):
    """Print the other verbs of the given verb's cluster, nearest first.

    The cluster is the first column of ``R`` that equals 1 in the verb's
    row. Members are ordered by ``d`` to the verb's feature vector, and the
    first entry of that ordering is left out of the printed list.
    """
    target = X[current_verb_index][0]
    assignment = R[current_verb_index][0]
    k = np.where(assignment == 1)[0][0]
    similar_verbs = df[R[:, k] == 1]['verb']
    distances = [d(target, row) for row in X[similar_verbs.index]]
    ranked = sort_dict(dict(zip(similar_verbs, distances)))
    # Skip the first entry (presumably the verb itself, at distance 0).
    print(list(ranked)[1:])
def print_sample_usages(verb):
    """Print the sentences that contain ``verb``, smallest distance first.

    Reads 'cleaned_dataframe.csv', keeps the rows whose 'verb' equals
    ``verb`` and, for every distinct sentence, remembers the smallest
    'distance' seen. The sentences are printed in ascending order of that
    distance.
    """
    df = read_csv('cleaned_dataframe.csv', sep='~')
    df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form']
    usages = df[df['verb'] == verb]
    sentences = usages['sentence'].tolist()
    distances = usages['distance'].tolist()
    closest = {}
    for sentence, distance in zip(sentences, distances):
        # Keep the minimum distance recorded for each sentence.
        if sentence not in closest or closest[sentence] > distance:
            closest[sentence] = distance
    for sentence in sort_dict(closest):
        print(sentence)
# Load the assignment matrix and cluster means written by run_k_means().
R = np.genfromtxt('R.csv', delimiter='~')
M = np.genfromtxt('M.csv', delimiter='~')
X, verbs, forms, df = get_verbs_data()
# Report the cluster, form distribution and sample sentences for one verb.
print_verb_info('armastama', df, X, R, forms)
\ No newline at end of file
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from util import sort_dict
def plot_features(K, R, X, forms):
    """Show one scatter plot per pair of features, coloured by cluster.

    Args:
        K: number of clusters (columns of ``R``).
        R: assignment matrix, one row per sample and one column per cluster.
        X: 2-D feature array, one row per sample.
        forms: feature names used as axis labels, aligned with ``X`` columns.
    """
    # One random RGB colour per cluster; each point's colour is its row of
    # R mixed over those colours.
    random_colors = np.random.random((K, 3))
    colors = R.dot(random_colors)
    n_features = X.shape[1]
    for i in range(n_features - 1):
        # The inner bound is n_features so the last feature is paired too;
        # the previous bound of n_features - 1 never plotted the last column.
        for j in range(i + 1, n_features):
            plt.scatter(X[:, i], X[:, j], c=colors, s=7, alpha=0.9)
            plt.xlabel(forms[i])
            plt.ylabel(forms[j])
            plt.show()
def plot_form_pdf(X, forms):
    """Overlay a density estimate of every feature column on one plot.

    Each legend entry gives the feature name with the column's mean and
    standard deviation. The x axis is limited to [-0.01, 0.015].
    """
    ax = None
    labels = []
    for i, x in enumerate(X.T):
        # Passing ax=None on the first pass creates the axes; later passes
        # draw onto the axes returned by the first call.
        ax = pd.DataFrame(x).plot.density(bw_method=0.1, ax=ax)
        labels.append('%s - mean: %.4f std: %.4f' % (forms[i], x.mean(), x.std()))
    plt.xlim(-0.01, 0.015)
    plt.legend(labels=labels)
    plt.show()
def x_sum(x):
    """Return ``x.sum()``; usable as a sort key for rows of an array."""
    total = x.sum()
    return total
def hist_verbs(X, forms, verbs):
    """Show a stacked bar chart of the features of the first ten rows.

    Args:
        X: 2-D feature array; only the first ten rows are plotted.
        forms: feature names used as legend labels, aligned with ``X``
            columns.
        verbs: currently unused.
    """
    X = X[:10, :]
    # X = np.array(sorted(X, key=x_sum))
    positions = np.arange(X.shape[0])
    # Running height of each bar so far. Every segment must start on top of
    # all previous segments; using only the previous column as the base made
    # the segments overlap.
    bottom = np.zeros(X.shape[0])
    for i in range(X.shape[1]):
        x = X[:, i]
        print(x)
        color = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))
        plt.bar(positions, x, bottom=bottom, color=color, label=forms[i])
        bottom = bottom + x
    plt.legend()
    plt.show()
def plot_costs(costs):
    """Plot the sequence ``costs`` as a line chart titled "Costs" and show it."""
    plt.plot(costs)
    plt.title("Costs")
    plt.show()
def hist_forms(X, forms):
    """Show a bar chart of each feature's column sum, in ``sort_dict`` order.

    Args:
        X: 2-D feature array.
        forms: feature names aligned with the columns of ``X``.
    """
    totals = sort_dict(dict(zip(forms, X.sum(axis=0))))
    positions = np.arange(len(totals))
    plt.bar(positions, list(totals.values()), color='green')
    plt.xticks(positions, list(totals.keys()), rotation='vertical')
    plt.show()
def pie_chart_clusters(X, R, K, forms):
    """Show one pie chart per cluster with the summed features of its members.

    Args:
        X: 2-D feature array, one row per sample.
        R: assignment matrix; column ``k`` equals 1 for members of cluster k.
        K: number of clusters to draw.
        forms: feature names used as slice labels.
    """
    for k in range(K):
        members = R[:, k] == 1
        feature_totals = X[members].sum(axis=0)
        plt.pie(feature_totals, labels=forms)
        plt.title('Cluster %i' % k)
        plt.axis('equal')
        plt.show()
def pie_chart_verb(x, forms, verb):
    """Show a pie chart of one verb's feature distribution.

    Args:
        x: sequence of feature values for the verb.
        forms: feature names aligned with ``x``.
        verb: used as the chart title.

    The values are scaled by 100 before plotting. Slices whose scaled value
    is below 0.1 get an empty label on the pie, while the legend still lists
    every form. Neither ``x`` nor ``forms`` is modified; earlier versions
    rewrote both in place.
    """
    percentages = [value * 100 for value in x]
    # Blank out labels of tiny slices so they do not clutter the chart.
    slice_labels = [
        '' if percentage < 0.1 else form
        for percentage, form in zip(percentages, forms)
    ]
    plt.pie(percentages, labels=slice_labels)
    plt.title(verb)
    plt.axis('equal')
    plt.legend(labels=list(forms))
    plt.show()
\ No newline at end of file
......@@ -47,4 +47,7 @@ def load_dict(name):
def save_dict(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'`` with the highest protocol."""
    path = name + '.pkl'
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
def sort_dict(d):
    """Return a new dict with the items of ``d`` ordered by ascending value.

    Items with equal values keep their original relative order.
    """
    ordered_items = sorted(d.items(), key=lambda item: item[1])
    return dict(ordered_items)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment