added graphics

9fd65b3a · etcart · 9abb263f · 9fd65b3a · 9fd65b3a · 9fd65b3a
Commit 9fd65b3a authored Apr 14, 2018 by etcart
Showing with 255 additions and 8 deletions
3
DensityCLustering
DensityClustering
DensityClustering.c
DensityClustering.o
RIVLower.h
RIVLower.h.gch
RIVgraphout
RIVgraphout.c
RIVgraphout.o
RIVlexicon.h
RIVlexicon.h.gch
RIVtools.h
RIVtools.h.gch
graphdata.txt
test.py
--- a/3
+++ b/3
--- a/DensityCLustering
+++ b/DensityCLustering
--- a/DensityClustering
+++ b/DensityClustering
--- a/DensityClustering.c
+++ b/DensityClustering.c
+/* this DBSCAN implementation is not meant to be an example of an easily
+ * written program. rather, it is a useful tool that can be used to validate
+ * the contents of a lexicon. using a density-based algorithm, it identifies
+ * clusters of vectors. if the lexicon is well formed, these clusters should
+ * be numerous, and each should contain closely related words */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <time.h>
+//RIVSIZE macro must be set to the size of the RIVs in the lexicon
+#define RIVSIZE 25000
+#define CACHESIZE 0
+#define EPSILON 0.98
+#define MINPOINTS 1
+#define UNCHECKED 0
+#define NOISE -1
+#define MINSIZE 10000
+
+
+#include "RIVtools.h"
+
+/* the node holds a vector, and metadata:
+ * -neighbors will hold pointers to the nodes found to be its neighbors
+ * -neighborCount will hold the number of neighbors
+ * -status will hold its cluster: a cluster number (> 0), UNCHECKED or NOISE
+ */
+struct DBnode{
+	sparseRIV RIV;
+	struct DBnode** neighbors;
+	int neighborCount;
+	int status;
+};
+
+void intercompare(struct DBnode* DBset, int nodeCount);
+void DBdive(struct DBnode* root, struct DBnode *DBset, int C);
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
+
+/* entry point: argv[1] names the lexicon directory.
+ * loads every sufficiently common vector, links all pairs whose similarity
+ * exceeds EPSILON, then prints the clusters found by walking those links.
+ * returns 0 on success, 1 on a usage or allocation error */
+int main(int argc, char *argv[]){
+	if(argc <2){
+		printf("give me a directory");
+		return 1;
+	}
+
+	/* build "<directory>/" with a bounds check; strcpy/strcat would
+	 * overrun rootString on a long argument */
+	char rootString[1000];
+	int written = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
+	if(written < 0 || (size_t)written >= sizeof rootString){
+		fprintf(stderr, "directory name too long: %s\n", argv[1]);
+		return 1;
+	}
+
+	/* a single element for directoryToL2s to realloc */
+	sparseRIV *fileRIVs = malloc(sizeof *fileRIVs);
+	if(!fileRIVs){
+		perror("malloc");
+		return 1;
+	}
+
+	int fileCount = 0;
+	int exitCode = 1;
+	int C = 0;
+	/* an array of nodes, one for each vector; kept on the heap because the
+	 * count is unbounded and may be zero (a zero-length VLA is undefined) */
+	struct DBnode *DBset = NULL;
+	/* number of nodes whose neighbor list has been allocated */
+	int initialized = 0;
+
+	lexOpen(argv[1]);
+	directoryToL2s(rootString, &fileRIVs, &fileCount);
+	printf("fileCount: %d\n", fileCount);
+
+	if(fileCount < 1){
+		/* nothing to cluster */
+		exitCode = 0;
+		goto cleanup;
+	}
+
+	DBset = malloc((size_t)fileCount * sizeof *DBset);
+	if(!DBset){
+		perror("malloc");
+		goto cleanup;
+	}
+
+	/* fill the node array with vectors and initialize metadata */
+	for(int i = 0; i < fileCount; i++){
+		fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
+		DBset[i].RIV = fileRIVs[i];
+		/* a single slot for later realloc'ing */
+		DBset[i].neighbors = malloc(sizeof *DBset[i].neighbors);
+		if(!DBset[i].neighbors){
+			perror("malloc");
+			goto cleanup;
+		}
+		DBset[i].neighborCount = 0;
+		DBset[i].status = UNCHECKED;
+		initialized++;
+	}
+	/* fileRIVs was only temporary; the nodes now hold the copies */
+	free(fileRIVs);
+	fileRIVs = NULL;
+
+	intercompare(DBset, fileCount);
+
+	for(int i=0; i<fileCount; i++){
+		/* already claimed by a cluster, or already marked as noise */
+		if(DBset[i].status) continue;
+		if(DBset[i].neighborCount <MINPOINTS){
+			DBset[i].status = NOISE;
+			continue;
+		}
+		/* an unclaimed node with enough neighbors seeds a new cluster */
+		C++;
+		printf("\ncluster %d\n", C);
+		DBset[i].status = C;
+		printf("root: %s, %d, %lf\n", DBset[i].RIV.name, DBset[i].RIV.frequency, DBset[i].RIV.magnitude);
+		DBdive(&DBset[i], DBset, C);
+	}
+	exitCode = 0;
+
+cleanup:
+	for(int i = 0; i < initialized; i++){
+		free(DBset[i].neighbors);
+	}
+	free(DBset);
+	free(fileRIVs);
+	/* pair the lexOpen above */
+	lexClose();
+	return exitCode;
+}
+/* recursively grow cluster C outward from root.
+ * every neighbor of root that is not yet in a cluster is printed and
+ * assigned to C; neighbors that themselves have at least MINPOINTS
+ * neighbors are expanded in turn.
+ * -root:  node already assigned to cluster C
+ * -DBset: the full node array (not read here; kept for the call signature)
+ * -C:     the cluster number being grown, expected to be > 0
+ * recursion depth can reach the size of the cluster */
+void DBdive(struct DBnode* root, struct DBnode *DBset, int C){
+	(void)DBset;
+
+	for(int i = 0; i < root->neighborCount; i++){
+		/* for easier coding, put it in a local variable */
+		struct DBnode *branch = root->neighbors[i];
+
+		/* skip nodes already claimed by a cluster; UNCHECKED and NOISE
+		 * nodes (status <= 0) are still available */
+		if(branch->status > 0){
+			continue;
+		}
+
+		printf(">>%s, %d, %lf\n", branch->RIV.name, branch->RIV.frequency, branch->RIV.magnitude);
+
+		/* include this in the cluster C */
+		branch->status = C;
+		/* spread through this branch if it has enough neighbors; the
+		 * threshold matches the one main uses to seed a cluster
+		 * (fewer than MINPOINTS neighbors means noise) */
+		if(branch->neighborCount >= MINPOINTS){
+			/* recursive dive into next branch */
+			DBdive(branch, DBset, C);
+		}
+	}
+}
+/* load the vectors of the lexicon directory into a growing array.
+ * -rootString: path of the directory to scan
+ * -fileRIVs:   pointer to a heap array; realloc'ed here as vectors are added
+ * -fileCount:  pointer to the number of vectors stored; incremented here
+ * fileRIVs and fileCount are accessed as pointers, so that the caller sees
+ * them changed after this function returns.
+ * entries whose name starts with '.', and sub-directories, are skipped.
+ * on an unreadable directory a message is printed and nothing is changed */
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
+
+	DIR *directory = opendir(rootString);
+	if(!directory){
+		printf("location not found, %s\n", rootString);
+		return;
+	}
+
+	struct dirent *files;
+	while((files=readdir(directory))){
+		if(*(files->d_name) == '.') continue;
+
+		/* the lexicon should not have valid sub-directories */
+		if(files->d_type == DT_DIR) continue;
+
+		denseRIV* temp = lexPull(files->d_name);
+		/* nothing could be pulled for this entry */
+		if(!temp) continue;
+
+		/* if the vector has been encountered more than MINSIZE times
+		 * then it should be statistically significant, and useful */
+		if(temp->contextSize >MINSIZE){
+			/* grow through a temporary so the existing array is not
+			 * lost if realloc fails */
+			sparseRIV *grown = realloc(*fileRIVs, ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
+			if(!grown){
+				perror("realloc");
+				free(temp);
+				break;
+			}
+			*fileRIVs = grown;
+
+			sparseRIV *slot = &grown[*fileCount];
+			*slot = normalize(*temp, 500);
+			slot->magnitude = temp->magnitude;
+			/* NOTE(review): assumes the name field is large enough for
+			 * any d_name - confirm against the sparseRIV definition */
+			strcpy(slot->name, files->d_name);
+			(*fileCount)++;
+		}
+		free(temp);
+	}
+	closedir(directory);
+}
+
+/* append neighbor to node's neighbor list, growing the list by one slot.
+ * the process exits if memory cannot be obtained: intercompare has no way
+ * to report failure, and a partial neighbor graph would give wrong clusters */
+static void linkNeighbor(struct DBnode *node, struct DBnode *neighbor){
+	struct DBnode **grown = realloc(node->neighbors, ((size_t)node->neighborCount+1)*sizeof *grown);
+	if(!grown){
+		perror("realloc");
+		exit(EXIT_FAILURE);
+	}
+	grown[node->neighborCount++] = neighbor;
+	node->neighbors = grown;
+}
+
+/* compare every pair of nodes once and record close pairs as neighbors.
+ * -DBset:     array of nodes, each with an allocated neighbors list
+ * -nodeCount: number of nodes in DBset
+ * a pair is linked in both directions when cosCompare reports a value
+ * greater than EPSILON */
+void intercompare(struct DBnode* DBset, int nodeCount){
+	denseRIV baseDense;
+	for(int i=0; i<nodeCount; i++){
+		/* map the RIV in question to a dense for comparison */
+		memset(baseDense.values, 0, sizeof baseDense.values);
+		addS2D(baseDense.values, DBset[i].RIV);
+		baseDense.magnitude = DBset[i].RIV.magnitude;
+		/* for each later vector; pairs with earlier vectors were already
+		 * handled when those vectors were the base */
+		for(int j=i+1; j<nodeCount; j++){
+			/* get cosine comparison against that vector */
+			double cosine = cosCompare(baseDense, DBset[j].RIV);
+
+			/* if this pair is close enough */
+			if(cosine>EPSILON){
+				/* add the pairing to each node's list of neighbors */
+				linkNeighbor(&DBset[i], &DBset[j]);
+				linkNeighbor(&DBset[j], &DBset[i]);
+			}
+		}
+	}
+}
--- a/DensityClustering.o
+++ b/DensityClustering.o
--- a/RIVLower.h
+++ b/RIVLower.h
@@ -58,7 +58,7 @@ typedef struct{
 	int *values;
 	int *locations;
 	size_t count;
-	double magnitude;
+	float magnitude;
 	int contextSize;
 	int frequency;
 }sparseRIV;
@@ -71,7 +71,7 @@ typedef struct{
 	int cached;
 	char name[100];
 	int frequency;
-	double magnitude;
+	float magnitude;
 	int contextSize;
 	int values[RIVSIZE];
 }denseRIV;

--- a/RIVLower.h.gch
+++ b/RIVLower.h.gch
--- a/RIVgraphout
+++ b/RIVgraphout
--- a/RIVgraphout.c
+++ b/RIVgraphout.c
+#include <stdio.h>
+#define RIVSIZE 25000
+#define CACHESIZE 0
+#include "RIVtools.h"
+#include <dirent.h>
+
+/* entry point: argv[1] names the lexicon directory.
+ * for every vector whose contextSize is at least 10000, prints one line
+ * "frequency,contextSize,magnitude,name" and keeps a dense copy of the
+ * normalized vector in output[].
+ * returns 0 on success, 1 on a usage error or unreadable directory */
+int main(int argc, char* argv[]){
+	if(argc < 2){
+		fprintf(stderr, "give me a directory\n");
+		return 1;
+	}
+	lexOpen(argv[1]);
+	static denseRIV *output[60000] = {0};
+	const int outputMax = (int)(sizeof output / sizeof output[0]);
+
+	DIR *directory = opendir(argv[1]);
+	if(!directory){
+		printf("location not found, %s\n", argv[1]);
+		lexClose();
+		return 1;
+	}
+	int i=0;
+	struct dirent *files;
+	while((files=readdir(directory))){
+		if(*(files->d_name) == '.') continue;
+
+		/* the lexicon should not have valid sub-directories */
+		if(files->d_type == DT_DIR) continue;
+
+		/* output[] is fixed size; stop rather than write past its end */
+		if(i >= outputMax){
+			fprintf(stderr, "more than %d vectors, stopping early\n", outputMax);
+			break;
+		}
+
+		denseRIV *intake = lexPull(files->d_name);
+		/* nothing could be pulled for this entry */
+		if(!intake) continue;
+
+		/* if the vector has been encountered at least 10000 times
+		 * then it should be statistically significant, and useful */
+		if(intake->contextSize<10000){
+			/* release the rejected vector instead of leaking it */
+			free(intake);
+			continue;
+		}
+		sparseRIV examine = normalize(*intake, 500);
+		strcpy(examine.name, files->d_name);
+		printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
+
+		output[i] = calloc(1, sizeof *output[i]);
+		if(!output[i]){
+			perror("calloc");
+			free(intake);
+			free(examine.locations);
+			break;
+		}
+		addS2D(output[i]->values, examine);
+		output[i]->magnitude = examine.magnitude;
+		strcpy(output[i]->name, files->d_name);
+		output[i]->frequency = intake->frequency;
+		free(intake);
+		/* NOTE(review): only locations is freed, as in the original;
+		 * confirm whether normalize allocates values separately */
+		free(examine.locations);
+		i++;
+	}
+	closedir(directory);
+	lexClose();
+	/*lexOpen("consolidatedLexiconAggressive");
+	for(int j=0; j<i; j++){
+		
+		lexPush(output[j]);
+		
+		
+	}
+	lexClose();*/
+
+	/* release the dense copies collected above */
+	for(int j=0; j<i; j++){
+		free(output[j]);
+	}
+	return 0;
+}
--- a/RIVgraphout.o
+++ b/RIVgraphout.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
@@ -192,7 +192,7 @@ int fLexPush(denseRIV* output){

 		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
 		fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
+		fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
 		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
 		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
 		fwrite(temp.values, temp.count, sizeof(int), lexWord);
@@ -202,7 +202,7 @@ int fLexPush(denseRIV* output){
 		temp.count = 0;
 		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
 		fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
+		fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
 		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
 		fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
 	}
@@ -220,7 +220,7 @@ denseRIV* fLexPull(FILE* lexWord){
 	/* get metadata for vector */
 	fread(&typeCheck, 1, sizeof(size_t), lexWord);
 	fread(&output->frequency, 1, sizeof(int), lexWord);
-	fread(&output->contextSize, 1, sizeof(int), lexWord);
+	fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
 	fread(&output->magnitude, 1, sizeof(float), lexWord);

 	/* first value stored is the value count if sparse, and 0 if dense */
@@ -269,8 +269,6 @@ int cacheDump(){
 void signalSecure(int signum, siginfo_t *si, void* arg){
  if(cacheDump()){
 	  puts("cache dump failed, some lexicon data lost");
-  }else{
-	puts("cache dumped successfully");
  }
  signal(signum, SIG_DFL);
  kill(getpid(), signum);

--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVtools.h
+++ b/RIVtools.h
@@ -228,7 +228,7 @@ sparseRIV normalize(denseRIV input, int factor){
 		values[count]= round(input.values[i]*multiplier);
 		
 		/* drop any 0 values */
-		if(values[count] > 1)count++; 
+		if(values[count])count++; 
 	}
 	sparseRIV output;
 	output.count = count;

--- a/RIVtools.h.gch
+++ b/RIVtools.h.gch
--- a/graphdata.txt
+++ b/graphdata.txt
--- a/test.py
+++ b/test.py
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+
+data = open("../code/RIVet/graphdata.txt", "r");
+frequencies = [];
+mags = [];
+i = 0;
+for line in data:
+    if(int(line.split(",")[1])>40000):
+         continue;
+    frequencies.append(int(line.split(",")[1]))
+    mags.append(float(line.split(",")[2]))
+    if(mags[i]>80 and frequencies[i]>7000 and frequencies[i]<15000):
+        print(line)
+    i+=1
+
+
+plt.scatter(frequencies, mags)
+plt.show()