Commit 9fd65b3a by etcart

added graphics

parent 9abb263f
File added
/* this DBSCAN-style program is not meant to be an example of an easily written
* program. rather, it is a tool for validating the contents of a lexicon:
* using a density-based algorithm, it identifies clusters of vectors. if the
* lexicon is well formed, these clusters should be numerous and each should
* contain closely related words */
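/* a reading of the flow below (not a spec, just what this code does): first
* intercompare() builds a neighbor graph by cosine-comparing every pair of
* vectors against EPSILON, then main() walks the still-unclaimed nodes and
* DBdive() recursively grows a cluster out of each node that has at least
* MINPOINTS neighbors */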
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <time.h>
//RIVSIZE macro must be set to the size of the RIVs in the lexicon
#define RIVSIZE 25000
#define CACHESIZE 0
#define EPSILON 0.98
#define MINPOINTS 1
#define UNCHECKED 0
#define NOISE -1
#define MINSIZE 10000
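/* tuning notes, as these macros are read by the code below: EPSILON is the
* minimum cosine similarity for two vectors to count as neighbors, MINPOINTS
* is the number of neighbors a node needs before it can seed or keep spreading
* a cluster, and MINSIZE is the minimum context count for a word to be
* considered at all */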
#include "RIVtools.h"
/* each node holds a vector, and metadata:
* -neighbors holds pointers to the nodes within EPSILON of this one
* -neighborCount holds the number of those neighbors
* -status holds its cluster assignment: a cluster number, NOISE, or UNCHECKED
*/
struct DBnode{
sparseRIV RIV;
struct DBnode** neighbors;
int neighborCount;
int status;
};
void intercompare(struct DBnode* DBset, int nodeCount);
void DBdive(struct DBnode* root, struct DBnode *DBset, int C);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
if(argc < 2){
printf("give me a lexicon directory\n");
return 1;
}
int fileCount = 0;
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[1000];
lexOpen(argv[1]);
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
/* an array of nodes, one for each vector */
struct DBnode DBset[fileCount];
/* fill the node array with vectors and initialize metadata */
for(int i = 0; i < fileCount; i++){
fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
DBset[i].RIV = fileRIVs[i];
/* a single malloc for later realloc'ing */
DBset[i].neighbors = malloc(sizeof(struct DBnode*));
DBset[i].neighborCount = 0;
DBset[i].status = UNCHECKED;
}
/* the sparseRIV structs were copied into DBset, so only the temporary array
* itself is freed; the data each struct points to is still referenced */
free(fileRIVs);
intercompare(DBset, fileCount);
int C = 0;
for(int i=0; i<fileCount; i++){
if(DBset[i].status) continue;
if(DBset[i].neighborCount <MINPOINTS){
DBset[i].status = NOISE;
continue;
}
C++;
printf("\ncluster %d\n", C);
DBset[i].status = C;
printf("root: %s, %d, %lf\n", DBset[i].RIV.name, DBset[i].RIV.frequency, DBset[i].RIV.magnitude);
DBdive(&DBset[i], DBset, C);
}
return 0;
}
void DBdive(struct DBnode* root, struct DBnode *DBset, int C){
for(int i = 0; i < root->neighborCount; i++){
/* skip any node already claimed by a cluster */
if(root->neighbors[i]->status > 0){
continue;
}
/* for easier coding, put it in a local variable */
struct DBnode *branch = root->neighbors[i];
printf(">>%s, %d, %lf\n", branch->RIV.name, branch->RIV.frequency, branch->RIV.magnitude);
/* include this in the cluster C */
branch->status = C;
/* if this branch has enough neighbors to spread */
if(branch->neighborCount > MINPOINTS){
/* recursive dive into next branch */
DBdive(branch, DBset, C);
}
}
}
/* fileRIVs and fileCount are passed by pointer so that the caller
* sees the grown array and the updated count */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
DIR *directory;
struct dirent *files = 0;
if(!(directory = opendir(rootString))){
printf("location not found, %s\n", rootString);
return;
}
while((files=readdir(directory))){
if(*(files->d_name) == '.') continue;
if(files->d_type == DT_DIR){
/* the lexicon should not have valid sub-directories */
continue;
}
denseRIV* temp = lexPull(files->d_name);
/* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */
if(temp->contextSize >MINSIZE){
(*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
(*fileRIVs)[(*fileCount)] = normalize(*temp, 500);
(*fileRIVs)[(*fileCount)].magnitude = temp->magnitude;
strcpy((*fileRIVs)[(*fileCount)].name, files->d_name);
(*fileCount)++;
}
free(temp);
}
}
void intercompare(struct DBnode* DBset, int nodeCount){
double cosine;
denseRIV baseDense;
for(int i=0; i<nodeCount; i++){
/* map the RIV in question to a dense for comparison */
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
addS2D(baseDense.values, DBset[i].RIV);
baseDense.magnitude = DBset[i].RIV.magnitude;
/* compare against each later vector in the set */
for(int j=i+1; j<nodeCount; j++){
/* compute the cosine similarity between the two vectors */
cosine = cosCompare(baseDense, DBset[j].RIV);
/* if this pair is close enough */
if(cosine>EPSILON){
/* add the pairing to each node's list of neighbors */
DBset[i].neighbors = realloc(DBset[i].neighbors, (DBset[i].neighborCount+1)*sizeof(struct DBnode*));
DBset[j].neighbors = realloc(DBset[j].neighbors, (DBset[j].neighborCount+1)*sizeof(struct DBnode*));
DBset[i].neighbors[DBset[i].neighborCount++] = &DBset[j];
DBset[j].neighbors[DBset[j].neighborCount++] = &DBset[i];
}
}
}
}
@@ -58,7 +58,7 @@ typedef struct{
 int *values;
 int *locations;
 size_t count;
-double magnitude;
+float magnitude;
 int contextSize;
 int frequency;
 }sparseRIV;
@@ -71,7 +71,7 @@ typedef struct{
 int cached;
 char name[100];
 int frequency;
-double magnitude;
+float magnitude;
 int contextSize;
 int values[RIVSIZE];
 }denseRIV;
...
File added
File added
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#include "RIVtools.h"
int main(int argc, char* argv[]){
if(argc < 2){
printf("give me a lexicon directory\n");
return 1;
}
lexOpen(argv[1]);
denseRIV* intake;
sparseRIV examine;
static denseRIV *output[60000] = {0};
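/* a fixed-size table of retained vectors; 60000 is presumably just a cap
* assumed to exceed the number of words that survive the filter below */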
DIR *directory;
struct dirent *files = 0;
if(!(directory = opendir(argv[1]))){
printf("location not found, %s\n", argv[1]);
return 1;
}
int i=0;
while((files=readdir(directory))){
if(*(files->d_name) == '.') continue;
if(files->d_type == DT_DIR){
/* the lexicon should not have valid sub-directories */
continue;
}
intake = lexPull(files->d_name);
/* if the word has been seen in fewer than 10000 contexts it is not
* statistically significant enough to be useful, so release it and skip */
if(intake->contextSize < 10000){
free(intake);
continue;
}
examine = normalize(*intake, 500);
strcpy(examine.name, files->d_name);
printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
output[i] = calloc(1, sizeof(denseRIV));
addS2D(output[i]->values, examine);
output[i]->magnitude = examine.magnitude;
strcpy(output[i]->name, files->d_name);
output[i]->frequency = intake->frequency;
free(intake);
free(examine.locations);
i++;
}
closedir(directory);
lexClose();
/*lexOpen("consolidatedLexiconAggressive");
for(int j=0; j<i; j++){
lexPush(output[j]);
}
lexClose();*/
return 0;
}
File added
@@ -192,7 +192,7 @@ int fLexPush(denseRIV* output){
 fwrite(&temp.count, 1, sizeof(size_t), lexWord);
 fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
-fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
+fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
 fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
 fwrite(temp.locations, temp.count, sizeof(int), lexWord);
 fwrite(temp.values, temp.count, sizeof(int), lexWord);
@@ -202,7 +202,7 @@ int fLexPush(denseRIV* output){
 temp.count = 0;
 fwrite(&temp.count, 1, sizeof(size_t), lexWord);
 fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
-fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
+fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
 fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
 fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
 }
@@ -220,7 +220,7 @@ denseRIV* fLexPull(FILE* lexWord){
 /* get metadata for vector */
 fread(&typeCheck, 1, sizeof(size_t), lexWord);
 fread(&output->frequency, 1, sizeof(int), lexWord);
-fread(&output->contextSize, 1, sizeof(int), lexWord);
+fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
 fread(&output->magnitude, 1, sizeof(float), lexWord);
 /* first value stored is the value count if sparse, and 0 if dense */
@@ -269,8 +269,6 @@ int cacheDump(){
 void signalSecure(int signum, siginfo_t *si, void* arg){
 if(cacheDump()){
 puts("cache dump failed, some lexicon data lost");
-}else{
-puts("cache dumped successfully");
 }
 signal(signum, SIG_DFL);
 kill(getpid(), signum);
...
@@ -228,7 +228,7 @@ sparseRIV normalize(denseRIV input, int factor){
 values[count]= round(input.values[i]*multiplier);
 /* drop any 0 values */
-if(values[count] > 1)count++;
+if(values[count])count++;
 }
 sparseRIV output;
 output.count = count;
...
No preview for this file type
This source diff could not be displayed because it is too large.
import numpy as np
import matplotlib.pyplot as plt

# each line is expected to read "frequency,contextSize,magnitude,name",
# as printed by the examination program above
data = open("../code/RIVet/graphdata.txt", "r")
frequencies = []
mags = []
i = 0
for line in data:
    fields = line.split(",")
    # skip extreme outliers so the scatter plot stays readable
    if int(fields[1]) > 40000:
        continue
    frequencies.append(int(fields[1]))
    mags.append(float(fields[2]))
    # print the points in the region of interest
    if mags[i] > 80 and frequencies[i] > 7000 and frequencies[i] < 15000:
        print(line)
    i += 1
data.close()

plt.scatter(frequencies, mags)
plt.show()