Commit f2876a83 by simetk

shuffled libraries

parent 3179d5fd
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5000
#define CACHESIZE 0
#define NONZEROS 2
#define EPSILON 0.8
#define MINPOINTS 15
#define UNCHECKED 0
#define NOISE -1
#include "RIVtoolsCPUlinux.h"
/* one clustering record per input document; DBset is the global array of
 * these records, allocated in main with one entry per file read */
struct DBnode{
/* the document's sparse vector, copied from the array of file RIVs */
sparseRIV RIV;
/* positions in DBset of the documents whose cosine with this one exceeded EPSILON */
int* indexes;
/* number of entries currently stored in indexes */
int indexCount;
/* 0 while unvisited; otherwise NOISE or the number of the cluster it was assigned to */
int status;
}*DBset;
void DBdive(int C, int i);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
DBset = malloc(fileCount*sizeof(struct DBnode));
struct DBnode* DBset_slider = DBset;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
(*DBset_slider).RIV = *fileRIVs_slider;
(*DBset_slider).indexes = malloc(sizeof(int));
(*DBset_slider).indexCount = 0;
(*DBset_slider++).status = 0;
fileRIVs_slider++;
}
free(fileRIVs);
clock_t beginnsquared = clock();
float cosine;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
for(int i=0; i<fileCount; i++){
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, DBset[i].RIV);
baseDense.magnitude = DBset[i].RIV.magnitude;
for(int j=i+1; j<fileCount; j++){
cosine = cosCompare(baseDense, DBset[j].RIV);
if(cosine>EPSILON){
DBset[i].indexes = realloc(DBset[i].indexes, (DBset[i].indexCount+1)*sizeof(int));
DBset[i].indexes[DBset[i].indexCount++] = j;
DBset[j].indexes = realloc(DBset[j].indexes, (DBset[j].indexCount+1)*sizeof(int));
DBset[j].indexes[DBset[j].indexCount++] = i;
}
}
}
int C = 0;
printf("got here\n");
for(int i=0; i<fileCount; i++){
if(DBset[i].status) continue;
if(DBset[i].indexCount <MINPOINTS){
DBset[i].status = NOISE;
}
C++;
DBset[i].status = C;
DBdive(C, i);
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
return 0;
}
void DBdive(int C, int i){
printf("starting at: %s\n", DBset[i].RIV.name);
struct DBnode *DBnet = malloc(sizeof(struct DBnode));
DBnet[0] = DBset[i];
int nodeCount = 1;
for(int j=0; j<nodeCount; j++){
for(int k=0; k<DBnet[j].indexCount;k++){
int index = DBnet[j].indexes[k];
if(DBset[index].status) continue;
DBset[index].status = C;
if(DBset[index].indexCount> MINPOINTS){
DBnet = realloc(DBnet, (nodeCount+1)*sizeof(struct DBnode));
printf("diving into: %s\n", DBset[index].RIV.name);
DBnet[nodeCount++] = DBset[index];
}
}
}
free(DBnet);
}
/* directoryToL2s walks rootString recursively and appends one L2 vector
 * per regular file to the growing array *fileRIVs.
 * rootString: directory path, expected to end in '/'
 * fileRIVs:   address of the heap array of vectors; may be moved by realloc
 * fileCount:  number of vectors currently in the array, updated in place
 * entries whose name begins with '.' are skipped */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;
	int pathLength;
	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			pathLength = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
				printf("path too long, skipping %s%s\n", rootString, files->d_name);
				continue;
			}
			directoryToL2s(pathString, fileRIVs, fileCount);
			/* a directory has been handled by the recursion; it must not
			 * also be opened and read as if it were a file */
			continue;
		}
		pathLength = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}
		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}
		sparseRIV *grown = realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
		if(!grown){
			printf("memory allocation failed\n");
			fclose(input);
			break;
		}
		(*fileRIVs) = grown;
		(*fileRIVs)[(*fileCount)] = fileToL2(input);
		/* NOTE(review): the capacity of the name field is not visible here;
		 * confirm it can hold a path of up to sizeof pathString bytes */
		strcpy((*fileRIVs)[(*fileCount)].name, pathString);
		fclose(input);
		(*fileCount)++;
	}
	/* every exit from the loop releases the directory handle */
	closedir(directory);
}
/* RIV stands for Random Index Vector, referring to the method of generating
* the basic vectors that correspond to each word. each word has an algorithmically
* generated vector which represents it in this mathematical model, such that a word
* will produce the same vector each time it is encountered*[1]. this base
* vector will be referred to as a L1 vector or a barcode vector
*
* by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
* or aggregate vector. in its simplest implementation, an L2 vector
* representation of a document contains a model of the contents of the
* document, enabling us to compare direction and magnitude of document
* vectors to understand their relationships to each other.
*
* but the system we are really interested in is the ability to form
* context vectors
* a context vector is the sum of all (L1?) vectors that the word
* has been encountered in context with. from these context vectors
* certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
*
* [1] a word produces the same vector each time it is encountered only
* if the environment is the same, ie. RIVs are the same dimensionality
* nonzero count is the same. comparing vectors produced in different
* environments yields meaningless drivel and should be avoided
*
* [2] what exactly "context" means remains a major stumbling point.
* paragraphs? sentences? some potential analyses would expect a static
* sized context (the nearest 10 words?) in order to be sensible, but
* it may be that some other definition of context is the most valid for
* this model. we will have to find out.
*
* some notes:
*
* -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
* the two primary data structures we will use to analyze these vectors
* each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
*
* -denseRIV is a standard vector representation.
* each array index corresponds to a dimension
* each value corresponds to a measurement in that dimension
*
* -sparseRIV is vector representation optimized for largely empty vectors
* each data point is a location/value pair where the
* location represents array index
* value represents value in that array index
*
* if we have a sparsely populated dense vector (mostly 0s) such as:
*
* |0|0|5|0|0|0|0|0|4|0|
*
* there are only 2 values in a ten element array. this could, instead
* be represented as
*
* |2|8| array indexes
* |5|4| array values
* |2| record of size
*
* and so, a 10 element vector has been represented in only 5 integers
*
* this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
*
* |0|0|5|0|0|0|0|0|4|0|
* |0|0|0|0|0|0|7|0|3|-2|
* and we wish to perform the dot product this will take 10 steps,
* 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
* |2|8|
* |5|4|
* |2|
*
* |6|8|9|
* |7|3|-2|
* |3|
*
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
* |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
* / / /
* |6|8|9|
* |7|3|-2|
* |3|
* we can simply access the dense vector by indexes held in the sparse vector
* reducing this operation to only 3 steps
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
#include "RIVLower.h"
/* addS2D adds a sparse vector into a dense one in place.
 * destination: dense array indexed by the sparse vector's locations
 * input:       sparse vector whose values are accumulated
 * returns destination */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* each location names the dense slot that receives the paired value */
	for(size_t pair=0; pair<input.count; pair++){
		destination[input.locations[pair]] += input.values[pair];
	}
	return destination;
}
/* mapI2D expands an implicit RIV (a list of locations) into a freshly
 * allocated, zero initialised dense array of RIVSIZE ints.
 * locations are consumed in pairs: the first of each pair gets +1, the
 * second gets -1. a trailing unpaired location gets +1.
 * returns the new array, which the caller must free, or NULL if the
 * allocation fails */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	int *destination = calloc(RIVSIZE,sizeof(int));
	if(!destination){
		return NULL;
	}
	size_t i = 0;
	/*apply values +1 or -1 at an index based on locations */
	for(; i+1<valueCount; i+=2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	/* an odd count leaves one location without a partner; reading a
	 * partner for it would run past the end of the input */
	if(i<valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* addI2D adds an implicit RIV (a list of locations) into an existing
 * dense array in place.
 * locations are consumed in pairs: the first of each pair gets +1, the
 * second gets -1. a trailing unpaired location gets +1.
 * returns destination */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	size_t i = 0;
	/*apply values +1 or -1 at an index based on locations */
	for(; i+1<valueCount; i+=2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	/* an odd count leaves one location without a partner; reading a
	 * partner for it would run past the end of the input */
	if(i<valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* consolidateI2SIndirect converts an implicit RIV to a sparse RIV by way
 * of a temporary dense array: the locations are expanded by mapI2D, packed
 * by consolidateD2S, and the temporary array is released */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
	int *expanded = mapI2D(implicit, valueCount);
	sparseRIV packed = consolidateD2S(expanded);
	free(expanded);
	return packed;
}
/* consolidateI2SDirect converts an implicit RIV to a sparse RIV without
 * building a dense array. locations contribute alternately +1 and -1 in
 * the order given; repeated locations are merged into a single entry.
 * the location/value pairs are assembled in RIVKey.h_tempBlock and then
 * copied into one heap block (locations first, values after) that the
 * caller releases with free(result.locations).
 * if that block cannot be allocated the result has count 0 and NULL
 * locations and values */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
	sparseRIV sparseOut;
	int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
	int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
	sparseOut.count = 0;
	int add = 1;
	for(size_t i=0; i<valueCount; i++){
		/* look for an entry already holding this location */
		size_t slot = 0;
		while(slot<sparseOut.count && locationsTemp[slot] != implicit[i]){
			slot++;
		}
		if(slot == sparseOut.count){
			locationsTemp[slot] = implicit[i];
			valuesTemp[slot] = add;
			sparseOut.count++;
		}else{
			/* accumulate into the matching entry, not into entry i */
			valuesTemp[slot] += add;
		}
		add = -add;
	}
	sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
	if(!sparseOut.locations){
		if(sparseOut.count){
			printf("memory allocation failed");
			sparseOut.count = 0;
		}
		sparseOut.values = NULL;
		return sparseOut;
	}
	sparseOut.values = sparseOut.locations+sparseOut.count;
	memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
	memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
	return sparseOut;
}
/* consolidateD2S packs the non-zero entries of a dense array of RIVSIZE
 * ints into a sparse RIV.
 * the pairs are first gathered in RIVKey.h_tempBlock, then copied into a
 * single heap block (locations first, values after) that the caller
 * releases with free(result.locations).
 * if that block cannot be allocated the result has count 0 and NULL
 * locations and values */
sparseRIV consolidateD2S(int *denseInput){
	sparseRIV output;
	output.count = 0;
	/* key/value pairs will be loaded to a worst-case sized temporary slot */
	int* locations = RIVKey.h_tempBlock+RIVSIZE;
	int* values = locations+RIVSIZE;
	for(int i=0; i<RIVSIZE; i++){
		/* act only on non-zeros */
		if(denseInput[i]){
			locations[output.count] = i;
			values[output.count] = denseInput[i];
			/* track size of forming sparseRIV */
			output.count++;
		}
	}
	/* a slot is opened for the locations/values pair */
	output.locations = malloc(output.count*2*sizeof(int));
	if(!output.locations){
		/* a NULL answer to a zero byte request is not a failure */
		if(output.count){
			printf("memory allocation failed"); //*TODO enable fail point knowledge
			output.count = 0;
		}
		output.values = NULL;
		return output;
	}
	/* copy locations values into opened slot */
	memcpy(output.locations, locations, output.count*sizeof(int));
	output.values = output.locations + output.count;
	/* copy values into opened slot */
	memcpy(output.values, values, output.count*sizeof(int));
	return output;
}
/* RIVInit prepares the shared RIVKey state: it sets the I2S threshold,
 * installs signalSecure as the SIGSEGV handler, allocates the temporary
 * work block and clears the dense RIV cache.
 * the process exits if the work block cannot be allocated */
void RIVInit(){
	RIVKey.I2SThreshold = sqrt(RIVSIZE);
	/* every field of the action, including the signal mask, is given a
	 * defined value before it is handed to sigaction */
	struct sigaction action;
	memset(&action, 0, sizeof action);
	sigemptyset(&action.sa_mask);
	action.sa_sigaction = signalSecure;
	action.sa_flags = SA_SIGINFO;
	//for(int i=1; i<27; i++){
	if(sigaction(SIGSEGV,&action,NULL)){
		puts("could not install the segfault handler");
	}
	//}
	/* open a slot at least large enough for worst case handling of
	 * sparse to dense conversion. may be enlarged by filetoL2 functions */
	RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
	if(!RIVKey.h_tempBlock){
		puts("memory allocation failed");
		exit(EXIT_FAILURE);
	}
	RIVKey.tempSize = 3*RIVSIZE;
	RIVKey.thing = 0;
	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* RIVCleanup flushes the cache through cacheDump, reports if any part of
 * the flush failed, and releases the temporary work block */
void RIVCleanup(){
	const int failures = cacheDump();
	if(failures != 0){
		puts("cache dump failed, some lexicon data was lost");
	}
	free(RIVKey.h_tempBlock);
}
/* wordtoSeed folds a NUL terminated word into an integer seed.
 * each character is shifted left 5 bits further than the one before it and
 * added to the running total, so letters are taken as digits counted in
 * base 32, which should be large enough to hold all english characters
 * plus a few outliers.
 * the sum is kept unsigned and the shift distance wraps at 32, so words of
 * any length produce a well defined result; distinct long words may
 * share a seed.
 * returns 0 for the empty word */
int wordtoSeed(unsigned char* word){
	unsigned int shift = 0;
	unsigned int seed = 0;
	while(*word){
		seed += (unsigned int)(*word) << shift;
		/* keep the distance below the width of a 32 bit int */
		shift = (shift+5) & 31u;
		word++;
	}
	return (int)seed;
}
/* makeSparseLocations writes the pseudo-random locations for a word into
 * the locations array, starting at offset count.
 * the generator is seeded from the word, and each location is drawn with
 * rand() modulo RIVSIZE. locations are written two at a time until at
 * least NONZEROS of them have been produced */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
	int *slot = locations+count;
	srand(wordtoSeed(word));
	for(int filled=0; filled<NONZEROS; filled+=2){
		/* two draws per pass, in order */
		slot[filled] = rand()%RIVSIZE;
		slot[filled+1] = rand()%RIVSIZE;
	}
}
/* fLexPush writes a dense RIV to the file lexicon/<name> and releases the
 * vector's values.
 * layout written: an int count, an int frequency, a float magnitude, then
 * either count locations followed by count values (sparse form) or, when
 * count is 0, RIVSIZE raw values (dense form). the count is written as an
 * int so that it matches the 4 byte type check fLexPull reads.
 * returns 0 on success and 1 if the file could not be opened or written;
 * when the file cannot be opened the values are not released */
int fLexPush(denseRIV RIVout){
	char pathString[200] = {0};
	/* word data will be placed in a (new?) file under the lexicon directory
	 * in a file named after the word itself */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	FILE *lexWord = fopen(pathString, "wb");
	if(!lexWord){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	sparseRIV temp = consolidateD2S(RIVout.values);
	int failed = 0;
	/* a count of 0 is the marker for the dense form, so an all zero
	 * vector is stored dense to keep the file unambiguous */
	if(temp.count > 0 && temp.count<(RIVSIZE/2)){
		/* smaller stored as sparse vector */
		int sparseCount = (int)temp.count;
		failed |= fwrite(&sparseCount, sizeof sparseCount, 1, lexWord) != 1;
		failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
		failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
		failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
		failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
	}else{
		/* saturation is too high, better to store dense */
		int denseMarker = 0;
		failed |= fwrite(&denseMarker, sizeof denseMarker, 1, lexWord) != 1;
		failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
		failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
		failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
	}
	/* buffered data is only known to be written once the close succeeds */
	failed |= fclose(lexWord) != 0;
	free(RIVout.values);
	free(temp.locations);
	return failed;
}
/* fLexPull reads one lexicon entry from lexWord into a newly allocated
 * dense RIV.
 * the entry starts with an int type check (the value count for a sparse
 * entry, 0 for a dense one), an int frequency and a float magnitude.
 * if the entry is truncated or inconsistent a message is printed and the
 * vector is returned with whatever had been read up to that point; the
 * values of a rejected sparse entry are left at zero */
denseRIV fLexPull(FILE* lexWord){
	denseRIV output = denseAllocate();
	if(!output.values){
		puts("lexicon pull failed: memory allocation failed");
		return output;
	}
	int typeCheck = 0;
	/* get metadata for vector */
	if(fread(&typeCheck, sizeof typeCheck, 1, lexWord) != 1
	|| fread(output.frequency, sizeof(int), 1, lexWord) != 1
	|| fread(&(output.magnitude), sizeof(float), 1, lexWord) != 1){
		puts("lexicon pull failed: header could not be read");
		return output;
	}
	/* first value stored is the value count if sparse, and 0 if dense */
	if (typeCheck){
		/* a count outside the vector's dimensions cannot be genuine */
		if(typeCheck < 0 || typeCheck > RIVSIZE){
			puts("lexicon pull failed: value count out of range");
			return output;
		}
		/* pull as sparseVector */
		sparseRIV temp;
		temp.count = (size_t)typeCheck;
		temp.locations = malloc(temp.count*2*sizeof(int));
		if(!temp.locations){
			puts("lexicon pull failed: memory allocation failed");
			return output;
		}
		temp.values = temp.locations+temp.count;
		int usable = fread(temp.locations, sizeof(int), temp.count, lexWord) == temp.count
			&& fread(temp.values, sizeof(int), temp.count, lexWord) == temp.count;
		/* every location must name a slot inside the dense array */
		for(size_t i=0; usable && i<temp.count; i++){
			if(temp.locations[i] < 0 || temp.locations[i] >= RIVSIZE){
				usable = 0;
			}
		}
		if(usable){
			addS2D(output.values, temp);
		}else{
			puts("lexicon pull failed: sparse data truncated or out of range");
		}
		free(temp.locations);
	}else{
		/* typecheck is thrown away, just a flag in this case */
		if(fread(output.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE){
			puts("lexicon pull failed: dense data truncated");
		}
	}
	output.cached = 0;
	return output;
}
/* signalSecure is the handler installed by RIVInit: it dumps the cache,
 * reports how the dump went, then restores the default action for the
 * signal and sends it to this process again.
 * si and arg are not used */
void signalSecure(int signum, siginfo_t *si, void* arg){
	const int dumpFailed = cacheDump();
	(void)si;
	(void)arg;
	puts(dumpFailed ? "cache dump failed, some lexicon data lost"
	                : "cache dumped successfully");
	signal(signum, SIG_DFL);
	kill(getpid(), signum);
}
/* cacheDump pushes every cached dense RIV out through fLexPush and prints
 * how many cache slots were unused and used.
 * returns the sum of the fLexPush results, so 0 means every push succeeded */
int cacheDump(){
	int idleSlots = 0;
	int liveSlots = 0;
	int failures = 0;
	denseRIV* const cache_end = RIVKey.RIVCache+CACHESIZE;
	for(denseRIV* slot = RIVKey.RIVCache; slot<cache_end; slot++){
		if(!slot->cached){
			idleSlots++;
			continue;
		}
		liveSlots++;
		failures += fLexPush(*slot);
	}
	printf("%d cacheslots unused\n%d, cacheslots used", idleSlots, liveSlots);
	return failures;
}
/* denseAllocate returns a dense RIV whose values are all zero.
 * values holds RIVSIZE+1 ints; for compact memory use, frequency points at
 * the extra int placed immediately after the values.
 * if the allocation fails, values and frequency are both NULL */
denseRIV denseAllocate(){
	denseRIV output;
	output.values = calloc(RIVSIZE+1, sizeof(int));
	/* no pointer arithmetic is done on a failed allocation */
	output.frequency = output.values ? output.values+RIVSIZE : NULL;
	output.magnitude = 0;
	output.cached = 0;
	return output;
}
/*TODO add a simplified free function*/
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
...@@ -53,7 +55,7 @@ typedef struct{ ...@@ -53,7 +55,7 @@ typedef struct{
int *values; int *values;
int *locations; int *locations;
size_t count; size_t count;
unsigned int frequency; int frequency;
float magnitude; float magnitude;
int boolean; int boolean;
}sparseRIV; }sparseRIV;
...@@ -113,7 +115,7 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion ...@@ -113,7 +115,7 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
* this produces an "implicit" RIV which can be used with the mapI2D function * this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV. * to create a denseRIV.
*/ */
void makesparseLocations(unsigned char* word, int *seeds, size_t seedCount); void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file, /* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush", * saving it for long-term aggregation. function is called by "lexpush",
...@@ -133,6 +135,7 @@ int wordtoSeed(unsigned char* word); ...@@ -133,6 +135,7 @@ int wordtoSeed(unsigned char* word);
*/ */
int* mapI2D(int *locations, size_t seedCount); int* mapI2D(int *locations, size_t seedCount);
int* addS2D(int* destination, sparseRIV input);
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount); sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount); sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
...@@ -143,274 +146,4 @@ void signalSecure(int signum, siginfo_t *si, void* arg); ...@@ -143,274 +146,4 @@ void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */ /* begin definitions */
#endif
/* addS2D adds a sparse vector into a dense one in place.
 * destination: dense array indexed by the sparse vector's locations
 * input:       sparse vector whose values are accumulated
 * returns destination */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* each location names the dense slot that receives the paired value */
	for(size_t pair=0; pair<input.count; pair++){
		destination[input.locations[pair]] += input.values[pair];
	}
	return destination;
}
/* mapI2D expands an implicit RIV (a list of locations) into a freshly
 * allocated, zero initialised dense array of RIVSIZE ints.
 * locations are consumed in pairs: the first of each pair gets +1, the
 * second gets -1. a trailing unpaired location gets +1.
 * returns the new array, which the caller must free, or NULL if the
 * allocation fails */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	int *destination = calloc(RIVSIZE,sizeof(int));
	if(!destination){
		return NULL;
	}
	size_t i = 0;
	/*apply values +1 or -1 at an index based on locations */
	for(; i+1<valueCount; i+=2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	/* an odd count leaves one location without a partner; reading a
	 * partner for it would run past the end of the input */
	if(i<valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* addI2D adds an implicit RIV (a list of locations) into an existing
 * dense array in place.
 * locations are consumed in pairs: the first of each pair gets +1, the
 * second gets -1. a trailing unpaired location gets +1.
 * returns destination */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	size_t i = 0;
	/*apply values +1 or -1 at an index based on locations */
	for(; i+1<valueCount; i+=2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	/* an odd count leaves one location without a partner; reading a
	 * partner for it would run past the end of the input */
	if(i<valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* consolidateI2SIndirect converts an implicit RIV to a sparse RIV by way
 * of a temporary dense array: the locations are expanded by mapI2D, packed
 * by consolidateD2S, and the temporary array is released */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
	int *expanded = mapI2D(implicit, valueCount);
	sparseRIV packed = consolidateD2S(expanded);
	free(expanded);
	return packed;
}
/* consolidateI2SDirect converts an implicit RIV to a sparse RIV without
 * building a dense array. locations contribute alternately +1 and -1 in
 * the order given; repeated locations are merged into a single entry.
 * the location/value pairs are assembled in RIVKey.h_tempBlock and then
 * copied into one heap block (locations first, values after) that the
 * caller releases with free(result.locations).
 * if that block cannot be allocated the result has count 0 and NULL
 * locations and values */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
	sparseRIV sparseOut;
	int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
	int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
	sparseOut.count = 0;
	int add = 1;
	for(size_t i=0; i<valueCount; i++){
		/* look for an entry already holding this location */
		size_t slot = 0;
		while(slot<sparseOut.count && locationsTemp[slot] != implicit[i]){
			slot++;
		}
		if(slot == sparseOut.count){
			locationsTemp[slot] = implicit[i];
			valuesTemp[slot] = add;
			sparseOut.count++;
		}else{
			/* accumulate into the matching entry, not into entry i */
			valuesTemp[slot] += add;
		}
		add = -add;
	}
	sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
	if(!sparseOut.locations){
		if(sparseOut.count){
			printf("memory allocation failed");
			sparseOut.count = 0;
		}
		sparseOut.values = NULL;
		return sparseOut;
	}
	sparseOut.values = sparseOut.locations+sparseOut.count;
	/* the two temporary arrays sit RIVSIZE ints apart, so locations and
	 * values have to be copied separately */
	memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
	memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
	return sparseOut;
}
/* consolidateD2S packs the non-zero entries of a dense array of RIVSIZE
 * ints into a sparse RIV.
 * the pairs are first gathered in RIVKey.h_tempBlock, then copied into a
 * single heap block (locations first, values after) that the caller
 * releases with free(result.locations).
 * if that block cannot be allocated the result has count 0 and NULL
 * locations and values */
sparseRIV consolidateD2S(int *denseInput){
	sparseRIV output;
	output.count = 0;
	/* key/value pairs will be loaded to a worst-case sized temporary slot */
	int* locations = RIVKey.h_tempBlock+RIVSIZE;
	int* values = locations+RIVSIZE;
	for(int i=0; i<RIVSIZE; i++){
		/* act only on non-zeros */
		if(denseInput[i]){
			locations[output.count] = i;
			values[output.count] = denseInput[i];
			/* track size of forming sparseRIV */
			output.count++;
		}
	}
	/* a slot is opened for the locations/values pair */
	output.locations = malloc(output.count*2*sizeof(int));
	if(!output.locations){
		/* a NULL answer to a zero byte request is not a failure */
		if(output.count){
			printf("memory allocation failed"); //*TODO enable fail point knowledge
			output.count = 0;
		}
		output.values = NULL;
		return output;
	}
	/* copy locations values into opened slot */
	memcpy(output.locations, locations, output.count*sizeof(int));
	output.values = output.locations + output.count;
	/* copy values into opened slot */
	memcpy(output.values, values, output.count*sizeof(int));
	return output;
}
/* RIVInit prepares the shared RIVKey state: it sets the I2S threshold,
 * installs signalSecure as the SIGSEGV handler, allocates the temporary
 * work block and clears the dense RIV cache.
 * the process exits if the work block cannot be allocated */
void RIVInit(){
	RIVKey.I2SThreshold = sqrt(RIVSIZE);
	/* every field of the action, including the signal mask, is given a
	 * defined value before it is handed to sigaction */
	struct sigaction action;
	memset(&action, 0, sizeof action);
	sigemptyset(&action.sa_mask);
	action.sa_sigaction = signalSecure;
	action.sa_flags = SA_SIGINFO;
	//for(int i=1; i<27; i++){
	if(sigaction(SIGSEGV,&action,NULL)){
		puts("could not install the segfault handler");
	}
	//}
	/* open a slot at least large enough for worst case handling of
	 * sparse to dense conversion. may be enlarged by filetoL2 functions */
	RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
	if(!RIVKey.h_tempBlock){
		puts("memory allocation failed");
		exit(EXIT_FAILURE);
	}
	RIVKey.tempSize = 3*RIVSIZE;
	RIVKey.thing = 0;
	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* RIVCleanup flushes the cache through cacheDump, reports if any part of
 * the flush failed, and releases the temporary work block */
void RIVCleanup(){
	const int failures = cacheDump();
	if(failures != 0){
		puts("cache dump failed, some lexicon data was lost");
	}
	free(RIVKey.h_tempBlock);
}
/* wordtoSeed folds a NUL terminated word into an integer seed.
 * each character is shifted left 5 bits further than the one before it and
 * added to the running total.
 * the sum is kept unsigned and the shift distance wraps at 32, so words of
 * any length produce a well defined result; distinct long words may
 * share a seed.
 * returns 0 for the empty word */
int wordtoSeed(unsigned char* word){
	unsigned int shift = 0;
	unsigned int seed = 0;
	while(*word){
		seed += (unsigned int)(*word) << shift;
		/* keep the distance below the width of a 32 bit int */
		shift = (shift+5) & 31u;
		word++;
	}
	return (int)seed;
}
/* makeSparseLocations writes the pseudo-random locations for a word into
 * the locations array, starting at offset count.
 * the generator is seeded from the word, and each location is drawn with
 * rand() modulo RIVSIZE. locations are written two at a time until at
 * least NONZEROS of them have been produced */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
	int *slot = locations+count;
	srand(wordtoSeed(word));
	for(int filled=0; filled<NONZEROS; filled+=2){
		/* two draws per pass, in order */
		slot[filled] = rand()%RIVSIZE;
		slot[filled+1] = rand()%RIVSIZE;
	}
}
/* fLexPush writes a dense RIV to the file lexicon/<name> in sparse form
 * and releases the vector's values.
 * layout written: a size_t count, an int frequency, a float magnitude,
 * then count locations followed by count values.
 * returns 0 on success and 1 if the file could not be opened or written;
 * when the file cannot be opened the values are not released */
int fLexPush(denseRIV RIVout){
	char pathString[200] = {0};
	/* word data will be placed in a (new?) file under the lexicon directory
	 * in a file named after the word itself */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	FILE *lexWord = fopen(pathString, "wb");
	if(!lexWord){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	sparseRIV temp = consolidateD2S(RIVout.values);
	int failed = 0;
	failed |= fwrite(&temp.count, sizeof(size_t), 1, lexWord) != 1;
	/* frequency is an int stored after the values, so it is sized as one */
	failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
	failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
	if(temp.count){
		failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
		failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
	}
	/* buffered data is only known to be written once the close succeeds */
	failed |= fclose(lexWord) != 0;
	free(RIVout.values);
	free(temp.locations);
	return failed;
}
/* fLexPull reads one sparse lexicon entry from lexWord into a newly
 * allocated dense RIV.
 * the entry holds a size_t count, an int frequency, a float magnitude,
 * then count locations followed by count values.
 * values holds RIVSIZE+1 ints, with frequency pointing at the last one.
 * if the entry is truncated or inconsistent a message is printed and the
 * vector is returned with whatever had been read up to that point; if the
 * dense array cannot be allocated, values and frequency are NULL */
denseRIV fLexPull(FILE* lexWord){
	denseRIV output;
	sparseRIV temp;
	output.magnitude = 0;
	output.cached = 0;
	output.values = calloc( (RIVSIZE+1) ,sizeof(int));
	if(!output.values){
		output.frequency = NULL;
		puts("lexicon pull failed: memory allocation failed");
		return output;
	}
	output.frequency = output.values+RIVSIZE;
	if(fread(&temp.count, sizeof(size_t), 1, lexWord) != 1
	|| fread(&temp.frequency, sizeof(int), 1, lexWord) != 1
	|| fread(&(temp.magnitude), sizeof(float), 1, lexWord) != 1){
		puts("lexicon pull failed: header could not be read");
		return output;
	}
	/* a count larger than the vector's dimensions cannot be genuine */
	if(temp.count > (size_t)RIVSIZE){
		puts("lexicon pull failed: value count out of range");
		return output;
	}
	*(output.frequency) = temp.frequency;
	output.magnitude = temp.magnitude;
	if(temp.count == 0){
		return output;
	}
	temp.locations = malloc(temp.count*2*sizeof(int));
	if(!temp.locations){
		puts("lexicon pull failed: memory allocation failed");
		return output;
	}
	temp.values = temp.locations+temp.count;
	int usable = fread(temp.locations, sizeof(int), temp.count, lexWord) == temp.count
		&& fread(temp.values, sizeof(int), temp.count, lexWord) == temp.count;
	/* every location must name a slot inside the dense array */
	for(size_t i=0; usable && i<temp.count; i++){
		if(temp.locations[i] < 0 || temp.locations[i] >= RIVSIZE){
			usable = 0;
		}
	}
	if(usable){
		addS2D(output.values, temp);
	}else{
		puts("lexicon pull failed: sparse data truncated or out of range");
	}
	free(temp.locations);
	return output;
}
/* signalSecure is the handler installed by RIVInit: it dumps the cache,
 * reports how the dump went, then restores the default action for the
 * signal and sends it to this process again.
 * si and arg are not used */
void signalSecure(int signum, siginfo_t *si, void* arg){
	const int dumpFailed = cacheDump();
	(void)si;
	(void)arg;
	puts(dumpFailed ? "cache dump failed, some lexicon data lost"
	                : "cache dumped successfully");
	signal(signum, SIG_DFL);
	kill(getpid(), signum);
}
/* cacheDump pushes every cached dense RIV out through fLexPush.
 * returns the sum of the fLexPush results, so 0 means every push succeeded */
int cacheDump(){
	int failures = 0;
	denseRIV* const cache_end = RIVKey.RIVCache+CACHESIZE;
	for(denseRIV* slot = RIVKey.RIVCache; slot<cache_end; slot++){
		if(!slot->cached){
			continue;
		}
		failures += fLexPush(*slot);
	}
	return failures;
}
/* denseAllocate returns a dense RIV whose values are all zero.
 * values holds RIVSIZE+1 ints; for compact memory use, frequency points at
 * the extra int placed immediately after the values.
 * if the allocation fails, values and frequency are both NULL */
denseRIV denseAllocate(){
	denseRIV output;
	output.values = calloc(RIVSIZE+1, sizeof(int));
	/* no pointer arithmetic is done on a failed allocation */
	output.frequency = output.values ? output.values+RIVSIZE : NULL;
	output.magnitude = 0;
	output.cached = 0;
	return output;
}
/*TODO add a simplified free function*/
/* isLetter reports whether c is acceptable inside a clean word:
 * a lowercase letter a-z, a space, or an underscore.
 * returns 1 if so, 0 otherwise */
int isLetter(char c){
	const int lowercase = (c >= 'a') && (c <= 'z');
	const int separator = (c == ' ') || (c == '_');
	return (lowercase || separator) ? 1 : 0;
}
/* isWordClean checks a word character by character with isLetter.
 * scanning stops at the terminating NUL or after 99 characters, whichever
 * comes first.
 * returns 0 as soon as a rejected character is met, 1 otherwise */
int isWordClean(char* word){
	for(int offset=0; offset<99 && word[offset] != '\0'; offset++){
		if(isLetter(word[offset]) == 0){
			return 0;
		}
	}
	return 1;
}
#ifndef RIVACCESS_H_
#define RIVACCESS_H_
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
...@@ -5,24 +7,4 @@ int isWordClean(char* word); ...@@ -5,24 +7,4 @@ int isWordClean(char* word);
/* used by wordClean */ /* used by wordClean */
int isLetter(char c); int isLetter(char c);
#endif
/* isLetter reports whether c is acceptable inside a clean word:
 * a lowercase letter a-z, a space, or an underscore.
 * returns 1 if so, 0 otherwise */
int isLetter(char c){
	const int lowercase = (c >= 'a') && (c <= 'z');
	const int separator = (c == ' ') || (c == '_');
	return (lowercase || separator) ? 1 : 0;
}
/* isWordClean checks a word character by character with isLetter.
 * scanning stops at the terminating NUL or after 99 characters, whichever
 * comes first.
 * returns 0 as soon as a rejected character is met, 1 otherwise */
int isWordClean(char* word){
	for(int offset=0; offset<99 && word[offset] != '\0'; offset++){
		if(isLetter(word[offset]) == 0){
			return 0;
		}
	}
	return 1;
}
...@@ -3,18 +3,11 @@ ...@@ -3,18 +3,11 @@
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#define RIVSIZE 5000 #define RIVSIZE 25000
#define CACHESIZE 0 #define CACHESIZE 0
#define NONZEROS 2 #define NONZEROS 2
#define THRESHOLD 0.7 #define THRESHOLD 0.70
#define COSINEACTION do {\ #include "RIVtools.h"
if(cosine > THRESHOLD){ \
printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\
(*multipliers).boolean = 0; \
RIVKey.thing++; \
}\
}while(0)
#include "RIVtoolsMorphic.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount); void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
...@@ -51,9 +44,9 @@ int main(int argc, char *argv[]){ ...@@ -51,9 +44,9 @@ int main(int argc, char *argv[]){
baseDense.values = malloc(RIVSIZE*sizeof(int)); baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs; fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider; sparseRIV* comparators_slider;
int count = 0;
while(fileRIVs_slider<fileRIVs_stop){ while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs; comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int)); memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider); baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude; baseDense.magnitude = (*fileRIVs_slider).magnitude;
...@@ -63,15 +56,15 @@ int main(int argc, char *argv[]){ ...@@ -63,15 +56,15 @@ int main(int argc, char *argv[]){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){ if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider); cosine = cosCompare(baseDense, *comparators_slider);
count++;
if(cosine>THRESHOLD){ if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine); printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0; (*comparators_slider).boolean = 0;
RIVKey.thing++; RIVKey.thing++;
} }
} }
comparators_slider++; comparators_slider++;
//cosineCompare(fileRIVs[i], fileRIVs, i);
} }
...@@ -80,8 +73,9 @@ int main(int argc, char *argv[]){ ...@@ -80,8 +73,9 @@ int main(int argc, char *argv[]){
} }
clock_t endnsquared = clock(); clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC; double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time); printf("\nnsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing); printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", RIVKey.thing);
clock_t endtotal = clock(); clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent); printf("total time:%lf\n\n", time_spent);
......
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#define CACHESIZE 100000 #define CACHESIZE 10000
//#define RIVSIZE 5
#include <setjmp.h>
#include <signal.h>
#include "RIVtoolsCPUlinux.h" #include "RIVtoolsCPUlinux.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
...@@ -13,8 +16,9 @@ void fileGrind(FILE* textFile); ...@@ -13,8 +16,9 @@ void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount); int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock(); clock_t begintotal = clock();
RIVInit(); RIVInit();
...@@ -31,22 +35,17 @@ int main(int argc, char *argv[]){ ...@@ -31,22 +35,17 @@ int main(int argc, char *argv[]){
} }
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
denseRIV *denseSet_slider; denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount; denseRIV *dense_stop = denseSet+RIVCount;
int *locations = additive.locations;
int *locations_stop = locations+additive.count;
int *values = additive.values;
//int *target; //int *target;
while(locations<locations_stop){
denseSet_slider = denseSet;
while(denseSet_slider<dense_stop){ while(denseSet_slider<dense_stop){
(*denseSet_slider).values[*locations]+= *values; addS2D((*denseSet_slider).values, additive);
//*target+=*values;
denseSet_slider++; denseSet_slider++;
}
locations++;
values++;
} }
} }
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){ int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
...@@ -71,9 +70,15 @@ void directoryGrind(char *rootString){ ...@@ -71,9 +70,15 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
while(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){ if(setjmp(readdirRecov)){
continue;
}
signal(SIGSEGV, readdirContingency);
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, SIG_DFL);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -135,5 +140,10 @@ void fileGrind(FILE* textFile){ ...@@ -135,5 +140,10 @@ void fileGrind(FILE* textFile){
} }
free(RIVArray); free(RIVArray);
free(aggregateRIV.locations); free(aggregateRIV.locations);
//free(aggregateRIV.values);
}
void readdirContingency(int sigNumber){
("readdir segfaulted, trying to recover");
longjmp(readdirRecov, 1);
} }
#include "RIVaccessories.h"
#include "RIVtools.h"
/* text2L2 builds the L2 (aggregate) sparse RIV for a NUL terminated text.
 * the text is split into whitespace separated words of at most 99
 * characters; each word's locations are appended to the shared temporary
 * block, which is grown when needed, and the whole list is consolidated at
 * the end.
 * the result's frequency is the number of words read and its boolean flag
 * is set to 1.
 * if the temporary block cannot be grown the remaining words are skipped */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	int displacement = 0;
	/* sscanf yields 1 only when a word was stored; at the end of the text
	 * it yields EOF, which must end the loop rather than count as a word */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* step to just after the word; the next %s skips the whitespace
		 * itself, so the terminating NUL is never stepped over */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				puts("memory allocation failed, remaining words skipped");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = RIVKey.h_tempBlock;
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file, until frequency
	 * is needed to hold some more useful data point */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace-delimited words from data until end of file (or a
 * read error) and returns the sparse L2 RIV that is the sum of the L1 RIVs of
 * every word read. words longer than 99 characters are read in pieces.
 * the implicit RIV is accumulated in RIVKey.h_tempBlock (grown as needed) and
 * handed to consolidateI2S.
 * output.frequency holds the number of words read, output.boolean is set to 1 */
sparseRIV fileToL2(FILE *data){
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;

	/* fscanf returns 1 for a word and EOF (non-zero!) at end of input or on
	 * error. testing "== 1" stops on both, and unlike a feof() check after a
	 * successful read it does not throw away the final word of a file that
	 * has no trailing whitespace */
	while(fscanf(data, "%99s", word) == 1){
		/* a word starting with a NUL byte cannot be hashed; stop here */
		if(!(*word)){
			break;
		}
		unsigned int blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* keep the old block reachable if the allocation fails */
			int *grown = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				fprintf(stderr, "fileToL2: out of memory, input truncated\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			RIVKey.tempSize+=NONZEROS;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2Clean works like fileToL2 but only accumulates words for which
 * isWordClean() returns non-zero; every other word is skipped and not counted.
 * the implicit RIV is accumulated in RIVKey.h_tempBlock (grown as needed) and
 * handed to consolidateI2S.
 * output.frequency holds the number of clean words, output.boolean is set to 1 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;

	/* compare against 1: fscanf returns EOF (non-zero) at end of input or on
	 * error, and a feof() check after a successful read would drop the last
	 * word of a file that does not end in whitespace */
	while(fscanf(data, "%99s", word) == 1){
		/* a word starting with a NUL byte cannot be hashed; stop here */
		if(!(*word)){
			break;
		}
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		unsigned int blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* keep the old block reachable if the allocation fails */
			int *grown = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				fprintf(stderr, "fileToL2Clean: out of memory, input truncated\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			RIVKey.tempSize+=NONZEROS;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (valueCount bare locations) into a
 * sparseRIV by dispatching on size: inputs at or above RIVKey.I2SThreshold go
 * to the indirect routine, everything smaller goes to the direct routine */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	if(valueCount >= RIVKey.I2SThreshold){
		/* optimized for large datasets */
		return consolidateI2SIndirect(implicit, valueCount);
	}
	/* direct method is faster on small datasets, but scales poorly on large ones */
	return consolidateI2SDirect(implicit, valueCount);
}
/* aggregateWord2D adds a word's pseudo-random pattern directly into a dense
 * vector. the generator is seeded from the word (wordtoSeed), then NONZEROS
 * rounds each increment one randomly chosen slot and decrement another.
 * destination is passed by value, but its values pointer is shared, so the
 * caller's array is modified */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	int rounds = NONZEROS;
	while(rounds-- > 0){
		/* the two draws must stay in this order to reproduce the same vector */
		int plusSlot = rand()%RIVSIZE;
		destination.values[plusSlot] += 1;
		int minusSlot = rand()%RIVSIZE;
		destination.values[minusSlot] -= 1;
	}
}
/* cosCompare returns the cosine between a dense RIV and a sparse RIV:
 * the dot product divided by the product of the two magnitudes.
 * both magnitude fields must already be filled in by the caller.
 * if either magnitude is zero the vectors have no direction and 0 is
 * returned instead of dividing by zero */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in 64 bits: products of large counts overflow an int */
	long long dot = 0;
	for(int n = comparator.count; n-- > 0; ){
		/* we calculate the dot-product to derive the cosine
		 * comparing sparse to dense by index */
		dot += (long long)comparator.values[n] * baseRIV.values[comparator.locations[n]];
	}
	if(baseRIV.magnitude == 0 || comparator.magnitude == 0){
		return 0;
	}
	/* dot divided by product of magnitudes */
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the euclidean length of a sparse RIV: the square
 * root of the sum of its squared values.
 * input is passed by value, so the caller has to store the result in the
 * RIV's magnitude field itself */
float getMagnitudeSparse(sparseRIV input){
	unsigned long long sumOfSquares = 0;
	for(int i=0; i<input.count; i++){
		/* widen before squaring; int*int overflows once |value| exceeds 46340 */
		long long value = input.values[i];
		sumOfSquares += (unsigned long long)(value*value);
	}
	input.magnitude = sqrt(sumOfSquares);
	return input.magnitude;
}
/* lexPull fetches the denseRIV stored for word.
 * with a cache (CACHESIZE > 0) the word's cache slot is checked first and
 * returned on a name match. otherwise the file "lexicon/<word>" is read with
 * fLexPull; if it cannot be opened a fresh vector from denseAllocate() is
 * returned. in both non-cache cases the result's name is set to word.
 * NOTE(review): the final strcpy assumes word fits in output.name - confirm
 * against the denseRIV declaration */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%CACHESIZE;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	/* if not, attempt to pull the word data from lexicon file */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: an overlong word must not overrun pathString.
	 * a truncated path names the wrong file, so it is treated as "not found" */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(pathLength >= 0 && (size_t)pathLength < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector (word is new to the lexicon) */
		/* #TODO enable NO-NEW features to protect mature lexicons? */
		output = denseAllocate();
	}
	strcpy(output.name, word);
	return output;
}
/* lexPush stores a denseRIV.
 * without a cache (CACHESIZE == 0) it is written out with fLexPush and 0 is
 * returned. with a cache, the RIV competes for the slot chosen by hashing its
 * name: an empty slot takes it, a less frequent occupant is written to file
 * and replaced (fLexPush's result is returned), otherwise the RIV itself is
 * written to file. every path other than the eviction returns 0 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	/* no cache configured: every push goes straight to file */
	fLexPush(RIVout);
	return 0;
#else /* CACHESIZE != 0 */
	/* a RIV that came out of the cache is either still there (and already
	 * up to date) or was evicted (and already written), nothing to do */
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int slot = rand()%CACHESIZE;

	if(RIVKey.RIVCache[slot].cached){
		if(*(RIVout.frequency) > *(RIVKey.RIVCache[slot].frequency)){
			/* the newcomer is more frequent: write the occupant out,
			 * then take over its slot */
			int status = fLexPush(RIVKey.RIVCache[slot]);
			RIVKey.RIVCache[slot] = RIVout;
			RIVKey.RIVCache[slot].cached = 1;
			return status;
		}
		/* the occupant stays, the newcomer goes to file */
		fLexPush(RIVout);
		return 0;
	}
	/* free slot: keep the RIV in cache instead of writing it */
	RIVKey.RIVCache[slot] = RIVout;
	RIVKey.RIVCache[slot].cached = 1;
	return 0;
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct builds a file's L2 RIV by adding every word straight into a
 * dense vector held in RIVKey.h_tempBlock (the first RIVSIZE ints are cleared
 * and used), then converting it with consolidateD2S.
 * output.frequency holds the number of words aggregated, output.boolean is
 * set to 1 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	/* a temporary dense RIV is stored in the tempBlock */
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;

	/* compare against 1: fscanf returns EOF (non-zero) at end of input or on
	 * error. a feof() check after a successful read would skip the last word
	 * of a file without trailing whitespace */
	while(fscanf(data, "%99s", word) == 1){
		if(!(*word)){
			break;
		}
		/* add word's L1 RIV to the accumulating dense RIV */
		aggregateWord2D(denseTemp, (char*)word);
		/* count only words that were actually aggregated */
		count++;
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	/* frequency records the number of words in this file */
	output.frequency = count;
	output.boolean = 1;
	return output;
}
/* RIVtools.h: public interface of the RIV toolkit (lexicon storage, building
 * L2 vectors from files or text, and comparing vectors) */
#ifndef RIVTOOLS_H_
#define RIVTOOLS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexPush stores a denseRIV: written to file for permanent storage, or held
 * in the cache when CACHESIZE is non-zero */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
 * and creates a denseRIV with those attributes.
 * if the file does not exist, it creates a 0 vector with the name of word
 */
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads whitespace-delimited words
 * and returns a sparse RIV which is the vector sum of the base RIVs of each
 * word contained
 */
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
 * accepted by isWordClean (lowercase letters and the '_' symbol).
 * this is important if you will be lexPush-ing those words later
 */
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct is an experiment in simplifying the process: words are
 * summed straight into a dense vector. it's slow */
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" between two RIVs: the dot product
 * divided by the product of their magnitudes. both magnitudes must be set */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* currently unused */
sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal
 * sparse RIV. this chooses the best method to perform the consolidation
 * and launches that function */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* returns the magnitude (square root of the sum of squared values) of a
 * sparse RIV. the argument is a copy, so the caller stores the result */
float getMagnitudeSparse(sparseRIV input);
#endif
...@@ -5,93 +5,6 @@ ...@@ -5,93 +5,6 @@
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
/* RIV stands for Random Index Vector, referring to the method of generating
* the basic vectors that correspond to each word. each word has an algorithmically
* generated vector which represents it in this mathematical model, such that a word
* will produce the same vector each time it is encountered*[1]. this base
* vector will be referred to as a L1 vector or a barcode vector
*
* by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
* or aggregate vector. in its simplest implementation, an L2 vector
* representation of a document contains a model of the contents of the
* document, enabling us to compare direction and magnitude of document
* vectors to understand their relationships to each other.
*
* but the system we are really interested in is the ability to form
* context vectors
* a context vector is the sum of all (L1?) vectors that the word
* has been encountered in context with. from these context vectors
* certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
*
* [1] a word produces the same vector each time it is encountered only
* if the environment is the same, i.e. the RIVs have the same dimensionality
* and the same nonzero count. comparing vectors produced in different
* environments yields meaningless drivel and should be avoided
*
* [2] what exactly "context" means remains a major stumbling point.
* paragraphs? sentences? some potential analyses would expect a static
* sized context (the nearest 10 words?) in order to be sensible, but
* it may be that some other definition of context is the most valid for
* this model. we will have to find out.
*
* some notes:
*
* -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
* the two primary data structures we will use to analyze these vectors
* each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
*
* -denseRIV is a standard vector representation.
* each array index corresponds to a dimension
* each value corresponds to a measurement in that dimension
*
* -sparseRIV is vector representation optimized for largely empty vectors
* each data point is a location/value pair where the
* location represents array index
* value represents value in that array index
*
* if we have a sparsely populated dense vector (mostly 0s) such as:
*
* |0|0|5|0|0|0|0|0|4|0|
*
* there are only 2 values in a ten element array. this could, instead
* be represented as
*
* |2|8| array indexes
* |5|4| array values
* |2| record of size
*
* and so, a 10 element vector has been represented in only 5 integers
*
* this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
*
* |0|0|5|0|0|0|0|0|4|0|
* |0|0|0|0|0|0|7|0|3|-2|
* and we wish to perform the dot product this will take 10 steps,
* 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
* |2|8|
* |5|4|
* |2|
*
* |6|8|9|
* |7|3|-2|
* |3|
*
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
* |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
* / / /
* |6|8|9|
* |7|3|-2|
* |3|
* we can simply access the dense vector by indexes held in the sparse vector
* reducing this operation to only 3 steps
/* lexPush writes a denseRIV to a file for permanent storage */ /* lexPush writes a denseRIV to a file for permanent storage */
...@@ -258,7 +171,7 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -258,7 +171,7 @@ sparseRIV fileToL2Clean(FILE *data){
sparseRIV consolidateI2S(int *implicit, size_t valueCount){ sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){ if(valueCount<RIVKey.I2SThreshold){
/* direct method is faster on small datasets, but has geometric scaling on large datasets */ /*direct method is faster on small datasets, but has geometric scaling on large datasets */
return consolidateI2SDirect(implicit, valueCount); return consolidateI2SDirect(implicit, valueCount);
}else{ }else{
/* optimized for large datasets */ /* optimized for large datasets */
...@@ -280,17 +193,16 @@ void aggregateWord2D(denseRIV destination, char* word){ ...@@ -280,17 +193,16 @@ void aggregateWord2D(denseRIV destination, char* word){
float cosCompare(denseRIV baseRIV, sparseRIV comparator){ float cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0; int dot = 0;
int n = comparator.count;
int *values = comparator.values; while(n){
int *locations = comparator.locations; n--;
int *locations_Stop = locations+comparator.count;
while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine /* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/ * comparing sparse to dense by index*/
dot += (*values)*(*(baseRIV.values+(*locations))); //dot += values[i]*baseRIV.values[locations[i]];
locations++; dot += comparator.values[n] * baseRIV.values[comparator.locations[n]];
values++;
printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
} }
/*dot divided by product of magnitudes */ /*dot divided by product of magnitudes */
float cosine = dot/(baseRIV.magnitude*comparator.magnitude); float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
...@@ -307,9 +219,9 @@ float getMagnitudeSparse(sparseRIV input){ ...@@ -307,9 +219,9 @@ float getMagnitudeSparse(sparseRIV input){
values++; values++;
} }
float magnitude = sqrt(temp);
input.magnitude = magnitude; input.magnitude = sqrt(temp);
return magnitude; return input.magnitude;
} }
denseRIV lexPull(char* word){ denseRIV lexPull(char* word){
...@@ -386,8 +298,8 @@ int lexPush(denseRIV RIVout){ ...@@ -386,8 +298,8 @@ int lexPush(denseRIV RIVout){
} }
return 0; return 0;
#endif /* CACHESIZE == 0 */ #endif /* CACHESIZE == 0 */
}
}
sparseRIV fileToL2direct(FILE *data){; sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0}; unsigned char word[100] = {0};
denseRIV denseTemp; denseRIV denseTemp;
......
#include <stdio.h> #include <stdio.h>
#define RIVSIZE 200 #define RIVSIZE 25000
#include "RIVtoolsCPUlinux.h" #include "RIVtoolsCPUlinux.h"
#include <time.h> #include <time.h>
...@@ -14,28 +14,21 @@ int main(){ ...@@ -14,28 +14,21 @@ int main(){
}if(numba1){ }if(numba1){
puts("numba1 opened successfully"); puts("numba1 opened successfully");
} }
sparseRIV first; sparseRIV first = fileToL2(numba1);
sparseRIV second; sparseRIV second = fileToL2(numba2);
int x=0; first.magnitude = getMagnitudeSparse(first);
denseRIV second2 = denseAllocate();
second2.values = addS2D(second2.values, second);
second2.magnitude = getMagnitudeSparse(second);
clock_t begintotal = clock(); clock_t begintotal = clock();
for(int i=0; i<iterations; i++){ for(int i=0; i<iterations; i++){
first = fileToL2(numba1); cosCompare(second2, first);
second = fileToL2(numba2);
x+= first.count+second.count;
} }
clock_t endtotal = clock(); clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
begintotal = clock();
for(int i=0; i<iterations; i++){
first = fileToL2direct(numba1);
second = fileToL2direct(numba2);
x+= first.count+second.count;
}
printf("%d", x);
endtotal = clock();
time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
} }
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
/* RIVData holds the global settings and scratch buffers shared by every
 * function in this file; the single instance is RIVKeyData, filled in by
 * setKeyData() */
struct RIVData{
/* dimensionality of every dense vector */
int RIVsize;
/* number of seeds/locations generated per word (forced even by setKeyData) */
int nonZeros;
/* nonZeros masks, XORed with seeds in makeSparseLocations */
long long int *masks;
/* scratch block of blockSize ints; used for seeds, locations and as the
 * dense base vector in cosineCompare */
int *h_tempBlock;
/* staging pointers: not allocated or used by the code visible here */
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
/* NOTE(review): the d_ prefix presumably marks device (GPU) buffers; nothing
 * visible here uses them - confirm */
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
/* counter: reset by setKeyData, incremented once per FileToL2 call */
int thing;
}RIVKeyData;
/* sparseRIV is a vector stored as parallel location/value arrays, plus some
 * metadata */
typedef struct{
/* label printed by cosineCompare when a match is reported */
char name[100];
/* values[i] is the value held at dense index locations[i] */
int *values;
int *locations;
/* number of location/value pairs */
int count;
/* not written by any function visible in this file */
int frequency;
/* euclidean length, filled in by getMagnitudes */
float magnitude;
/* "still active" flag: set to 1 by FileToL2, cleared by cosineCompare once
 * the RIV has been reported as a match; cleared entries are skipped */
int boolean;
}sparseRIV;
/* builds the sparse L2 RIV of every word in a file */
sparseRIV FileToL2(FILE *data);
/* converts a dense vector of RIVKeyData.RIVsize ints into *destination */
void consolidateD2S(sparseRIV *destination, int *denseInput);
/* initializes RIVKeyData; must run before any other function here */
void setKeyData(int RIVsize, int nonZeros, int blockSize);
/* clears destination and scatters a sparse RIV into it; returns destination */
int* mapS2D(int * destination, sparseRIV input);
/* turns seeds into vector locations, written to RIVKeyData.h_tempBlock */
int* makeSparseLocations(int *seeds, int seedCount);
/* appends RIVKeyData.nonZeros seeds for word at (*seeds)+*seedCount */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
/* compares baseRIV against each multiplier; returns a malloc'd array with one
 * cosine per multiplier (the caller frees it) */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
/* fills in the magnitude field of RIVCount sparse RIVs */
void getMagnitudes(sparseRIV *inputs, int RIVCount);
/* sums alternating +1/-1 at the given locations into a newly allocated dense
 * vector (the caller frees it) */
int *mapI2D(int *locations, int seedCount);
/* like FileToL2, but reads space-separated words from a string */
sparseRIV text2L2(unsigned char *text);
/* copies the next space-delimited token of *string into word and advances
 * *string past it */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
/* FileToL2 reads whitespace-delimited words from data until end of file and
 * returns their summed vector as a sparseRIV.
 * seeds for every word are collected in RIVKeyData.h_tempBlock, converted to
 * locations in place, mapped to a dense vector and consolidated.
 * the returned RIV owns its locations/values arrays; boolean is set to 1.
 * NOTE(review): nothing checks that h_tempBlock can hold nonZeros ints per
 * word; the blockSize given to setKeyData has to be large enough */
sparseRIV FileToL2(FILE *data){
	/* fixed buffer instead of a heap block that was never freed */
	unsigned char word[2000] = {0};
	int *seeds = RIVKeyData.h_tempBlock;
	int seedCount = 0;

	/* the field width keeps a long token from overrunning word.
	 * comparing against 1 stops at EOF/error and, unlike a feof() check after
	 * a successful read, keeps the last word of a file that has no trailing
	 * whitespace */
	while(fscanf(data, "%1999s", word) == 1){
		if(!(*word)){
			break;
		}
		makeSeeds(word, &seeds, &seedCount);
	}
	int *locations = makeSparseLocations(seeds, seedCount);
	int *L2dense = mapI2D(locations, seedCount);
	/* start from a fully zeroed RIV so name, frequency and magnitude are
	 * never read uninitialized */
	sparseRIV output = {0};
	consolidateD2S( &output, L2dense);
	free(L2dense);
	output.boolean = 1;
	RIVKeyData.thing++;
	return output;
}
/* cosineCompare computes the cosine between baseRIV and each of the
 * multiplierCount RIVs in multipliers.
 * baseRIV is first expanded into RIVKeyData.h_tempBlock. only multipliers
 * whose boolean flag is set are compared; a result >= threshold is printed
 * and clears that multiplier's flag.
 * returns a heap array with one float per multiplier (0 for skipped entries)
 * that the caller must free, or NULL if it cannot be allocated.
 * magnitudes must already be set (see getMagnitudes) */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
	int *baseDenseRIV = RIVKeyData.h_tempBlock;
	mapS2D(baseDenseRIV, baseRIV);

	/* zero-filled so the slots of skipped multipliers hold a defined value */
	float *outputs = (float*)calloc(multiplierCount, sizeof(float));
	if(!outputs){
		return NULL;
	}
	float *output_slider = outputs;
	sparseRIV *multipliersStop = multipliers+multiplierCount;

	while(multipliers<multipliersStop){
		if((*multipliers).boolean){
			/* 64-bit accumulator: products of large counts overflow an int */
			long long dot = 0;
			int *values = (*multipliers).values;
			int *locations = (*multipliers).locations;
			int *locations_Stop = locations+(*multipliers).count;
			while(locations<locations_Stop){
				dot += (long long)(*values)*(*(baseDenseRIV+(*locations)));
				locations++;
				values++;
			}
			*output_slider= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
			if(*output_slider>=threshold){
				printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, *output_slider);
				/* a reported match is not compared again */
				(*multipliers).boolean = 0;
			}
		}
		multipliers++;
		output_slider++;
	}
	return outputs;
}
/* getMagnitudes stores, in each of the RIVCount sparse RIVs, the euclidean
 * length of its value array (square root of the sum of squares) */
void getMagnitudes(sparseRIV *inputs, int RIVCount){
	for(int i=0; i<RIVCount; i++){
		/* 64-bit sum: squares of word counts overflow an int quickly */
		long long sumOfSquares = 0;
		int *values = inputs[i].values;
		int *values_stop = values+inputs[i].count;
		while(values<values_stop){
			sumOfSquares += (long long)(*values)*(*values);
			values++;
		}
		inputs[i].magnitude = sqrt((double)sumOfSquares);
	}
}
/* mapS2D expands a sparse RIV into a dense one: destination (which must hold
 * RIVKeyData.RIVsize ints) is cleared, then every value of input is written
 * at its location. returns destination */
int* mapS2D(int* destination, sparseRIV input){
	memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
	for(int pair=0; pair<input.count; pair++){
		destination[input.locations[pair]] = input.values[pair];
	}
	return destination;
}
/* mapI2D builds a dense vector from an implicit RIV: a zeroed array of
 * RIVKeyData.RIVsize ints is allocated and the listed locations receive
 * +1, -1, +1, ... in turn (even positions in the list add, odd subtract).
 * the caller owns, and must free, the returned array */
int* mapI2D(int *locations, int valueCount){
	int *dense = (int*)calloc(RIVKeyData.RIVsize,sizeof(int));
	for(int position=0; position<valueCount; position++){
		int sign = (position%2 == 0) ? 1 : -1;
		dense[locations[position]] += sign;
	}
	return dense;
}
/* consolidateD2S converts a dense vector of RIVKeyData.RIVsize ints into
 * sparse form: every non-zero entry becomes a location/value pair in
 * *destination and count is set to the number of pairs.
 * the locations and values arrays are heap allocated and owned by
 * *destination. if they cannot be allocated both pointers are NULL and
 * count is 0 */
void consolidateD2S(sparseRIV *destination, int *denseInput){
	int count = 0;
	destination->count = 0;
	destination->locations = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
	destination->values = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
	if(!destination->locations || !destination->values){
		free(destination->locations);
		free(destination->values);
		destination->locations = NULL;
		destination->values = NULL;
		return;
	}
	for(int i=0; i<RIVKeyData.RIVsize; i++){
		if(denseInput[i]){
			destination->locations[count] = i;
			destination->values[count] = denseInput[i];
			count++;
		}
	}
	destination->count = count;

	/* shrink to fit. never ask realloc for 0 bytes (an all-zero vector):
	 * that may free the block and hand back NULL */
	size_t kept = (count > 0) ? (size_t)count : 1;
	int *trimmed = (int*) realloc(destination->locations, kept*sizeof(int));
	/* if shrinking fails the original, larger block is still valid */
	if(trimmed){
		destination->locations = trimmed;
	}
	trimmed = (int*) realloc(destination->values, kept*sizeof(int));
	if(trimmed){
		destination->values = trimmed;
	}
}
/* setKeyData initializes the global RIVKeyData.
 * RIVsize   - dimensionality of the vectors
 * nonZeros  - seeds/locations per word; an odd value is reported and rounded
 *             up to the next even number
 * blockSize - size, in ints, of the scratch block h_tempBlock
 * it allocates the mask table and the scratch block and resets the counter */
void setKeyData(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2){
		printf("your nonZeros must be an even number");
		nonZeros++;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
	for(int i = 0; i<nonZeros; i++){
		/* each mask is SEEDMASK shifted a further 5 bits. shifting a 64-bit
		 * value by 64 or more is undefined, and SEEDMASK has run out of bits
		 * long before that, so those masks are simply 0 */
		int shift = 5*i;
		RIVKeyData.masks[i] = (shift < 64) ? (SEEDMASK>>shift) : 0;
	}
	RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
	RIVKeyData.thing = 0;
}
/* makeSeeds derives RIVKeyData.nonZeros seeds from word and appends them to
 * the seed list.
 * word      - NUL-terminated word to hash
 * seeds     - address of the seed array; seeds are written starting at
 *             (*seeds)[*seedCount]
 * seedCount - number of seeds already stored; increased by nonZeros
 * the caller must guarantee room for nonZeros more ints in *seeds */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
	/* mix the characters in unsigned arithmetic: the original signed
	 * "char << (i*5)" is undefined once the shift reaches the width of int
	 * (any word of 7+ characters). the shift amount is wrapped modulo 32,
	 * which assumes a 32-bit int and should match what x86 hardware produced
	 * for the original expression - TODO confirm seeds are unchanged */
	unsigned int mixed = 0;
	for(unsigned int i=0; word[i]; i++){
		mixed += (unsigned int)word[i] << ((i*5) % 32);
	}
	int seedbase = (int)mixed;

	int *seedTrack = (*seeds)+*seedCount;
	for(int i=0; i<RIVKeyData.nonZeros; i++){
		seedTrack[i] = (seedbase>>i)+(3*i);
	}
	*seedCount+=RIVKeyData.nonZeros;
}
/* makeSparseLocations turns seedCount seeds into vector locations.
 * each seed is XORed with the next mask (masks are used round-robin), reduced
 * to a non-negative 31-bit number and taken modulo RIVKeyData.RIVsize.
 * results are written to RIVKeyData.h_tempBlock, which is also returned;
 * seeds may be that same block, since each seed is read before its slot is
 * overwritten */
int* makeSparseLocations(int* seeds, int seedCount){
	int *locations = RIVKeyData.h_tempBlock;
	long long int *masks = RIVKeyData.masks;
	int maskIndex = 0;
	for(int i=0; i<seedCount; i++){
		locations[i] = ((seeds[i]^masks[maskIndex]) & 2147483647) %(RIVKeyData.RIVsize);
		/* step to the next mask, wrapping after the last one */
		maskIndex++;
		if(!(maskIndex<RIVKeyData.nonZeros)) maskIndex-=RIVKeyData.nonZeros;
	}
	return locations;
}
/* sscanAdvance copies the next token of *string into word.
 * a token ends at the first ' ' or at the terminating NUL; the space, if
 * there is one, is consumed but not copied. *string is left pointing just
 * after what was consumed and word is NUL-terminated.
 * word must be large enough for the token; no length is checked.
 * returns word */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
	unsigned char *cursor = *string;
	size_t length = 0;

	while(*cursor && *cursor != ' '){
		word[length++] = *cursor++;
	}
	/* swallow the single delimiting space */
	if(*cursor == ' '){
		cursor++;
	}
	word[length] = 0;
	*string = cursor;
	return word;
}
sparseRIV text2L2(unsigned char *text){
unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int));
unsigned char *text_slider = text;
int seedCount = 0;
while(*text_slider){
sscanAdvance(&text_slider, word);
makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000);
}
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense;
L2dense = mapI2D(locations, seedCount);
free(locations);
sparseRIV output;
consolidateD2S(&output, L2dense);
free(seeds);
return output;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment