Commit 3179d5fd by etcart

added comments and explanations

parent d78631fd
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <unistd.h>
#include <math.h> #include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
 * 25000 is the standard, but can be redefined specifically
 */
/* RIVSIZE may be supplied by the build (e.g. -DRIVSIZE=...); otherwise the
 * standard dimensionality is used */
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* zero is rejected along with negatives: RIVSIZE is used as a modulus
 * (rand()%RIVSIZE) and as an allocation length */
#if RIVSIZE<=0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZEROS macro defines the number of non-zero values that will be generated
 * for any level one (barcode) RIV. 2 is simple and lightweight to begin
 */
#ifndef NONZEROS #ifndef NONZEROS
#define NONZEROS 2 #define NONZEROS 2
#endif #endif
#if NONZEROS%2 || NONZEROS<1 #if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number" #error "NONZEROS must be an even, greater than 0 number"
#endif #endif
/* CACHESIZE macro defines the number of RIVs the system will cache. /* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly * a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems * faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push * that do not use lexpull/push
*/ */
#ifndef CACHESIZE #ifndef CACHESIZE
#define CACHESIZE 20 #define CACHESIZE 20
#endif #endif
#if CACHESIZE<0 #if CACHESIZE<0
#error "CACHESIZE cannot be a negative number" #error "CACHESIZE cannot be a negative number"
#endif #endif
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
 * as this is often an ideal case, it is advisable as the default
 * unless we are doing long term RIV aggregation.
 * specifically, a sparseRIV contains a pair of arrays,
 * containing locations and values, where pairs are found in like array
 * indices.
 */
typedef struct {
	/* identifier for the vector; presumably the word it belongs to -- TODO confirm */
	char name[100];
	/* non-zero values, parallel to locations. consolidateD2S and
	 * consolidateI2SDirect point this at the second half of the locations
	 * allocation, so it is never freed on its own */
	int *values;
	/* indices of the non-zero entries; start of the shared allocation */
	int *locations;
	/* number of location/value pairs held */
	size_t count;
	/* frequency and magnitude are stored with the vector in lexicon files
	 * (read back by fLexPull) */
	unsigned int frequency;
	float magnitude;
	/* NOTE(review): purpose not visible in this file */
	int boolean;
} sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector * this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally * math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic) * performed between sparse and dense (hetero-arithmetic)
*/ */
typedef struct {
	/* fLexPush uses this as the file name under lexicon/ */
	char name[100];
	/* dense value array; denseAllocate and fLexPull allocate RIVSIZE+1 ints */
	int *values;
	/* points at values[RIVSIZE], the extra trailing slot of the same
	 * allocation, so it is never freed on its own */
	int *frequency;
	float magnitude;
	/* cacheDump writes out every cache entry whose flag is non-zero */
	int cached;
} denseRIV;
/* RIVKey holds globally important data that should not be changed partway
 * through a run. the first function call in the program should always be:
 * RIVInit();
 * this will set these variables and open up memory blocks which the system
 * will use in the background
 */
/* single file-scope instance of the library state, filled in by RIVInit */
static struct RIVData {
	/* set to sqrt(RIVSIZE) by RIVInit; presumably the size at which the
	 * indirect implicit-to-sparse path is preferred -- TODO confirm */
	int I2SThreshold;
	/* scratch buffer of tempSize ints; the consolidate functions use the
	 * regions starting at +RIVSIZE and +2*RIVSIZE */
	int *h_tempBlock;
	int tempSize;
	/* NOTE(review): only ever set to 0 in the visible code; purpose unknown */
	int thing;
	/* cached dense vectors, written out to the lexicon by cacheDump */
	denseRIV RIVCache[CACHESIZE];
} RIVKey;
/* RIVinit should be the first function called in any usage of this library /* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference, * it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for * it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations. * the functions to use, so that we can move fast with rare allocations.
*/ */
void RIVInit(); void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees /* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files * blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/ */
void RIVCleanup(); void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned * all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't * to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata * need to carry metadata
*/ */
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations /* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs * based on location-value pairs
*/ */
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 * each call writes NONZEROS entries starting at locations[count]
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation. function is called by "lexpush",
 * which is what users should actually use. lexPush, unlike fLexPush,
 * has cache logic under the hood for speed and harddrive optimization.
 * returns 0 on success and non-zero on failure; once the file has been
 * opened, RIVout.values is freed by this call
 */
int fLexPush(denseRIV RIVout);
/* fLexPull reads one vector from an already opened lexicon file and expands
 * it into a newly allocated denseRIV
 */
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
 * arranged by chronological order of generation (as per makeSparseLocations)
 * it assigns, in the process of mapping, values according to ordering:
 * +1 for the first index of each pair, -1 for the second.
 * the returned array is allocated here and must be freed by the caller
 */
int* mapI2D(int *locations, size_t valueCount);
/* both build a sparseRIV from an implicit RIV: the indirect form goes through
 * a temporary dense array, the direct form merges repeated locations itself
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
/* pushes every cached denseRIV to the lexicon; returns the number of failures */
int cacheDump(void);
/* same mapping as mapI2D, accumulated into an existing dense array */
int* addI2D(int* destination, int* locations, size_t valueCount);
/* returns a zeroed denseRIV of RIVSIZE values plus the frequency slot */
denseRIV denseAllocate(void);
/* signal handler installed by RIVInit: dumps the cache, then re-raises */
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* begin definitions */ /* begin definitions */
/* addS2D adds every location/value pair of a sparseRIV into an existing
 * dense array and returns that same array.
 * #TODO fix destination parameter vs calloc of destination
 */
int* addS2D(int* destination, sparseRIV input){
	for(size_t pair = 0; pair < input.count; pair++){
		/* the stored location is the dense index the value belongs to */
		destination[input.locations[pair]] += input.values[pair];
	}
	return destination;
}
/* mapI2D expands an implicit RIV (a list of indices) into a newly allocated
 * dense array of RIVSIZE ints. indices are taken in pairs: the first of each
 * pair is incremented, the second decremented.
 *
 * locations:  valueCount indices, each expected to be in [0, RIVSIZE)
 * valueCount: number of indices; normally even. a trailing unpaired index
 *             receives +1 instead of reading past the end of the input
 * returns:    the dense array, owned by the caller (free it), or NULL if the
 *             allocation fails
 * #TODO fix destination parameter vs calloc of destination
 */
int* mapI2D(int *locations, size_t valueCount){
	int *destination = calloc(RIVSIZE, sizeof(int));
	if(!destination){
		return NULL;
	}
	size_t i = 0;
	/* whole pairs only: never touch locations[i+1] unless it exists */
	for(; i+1 < valueCount; i += 2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	if(i < valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* addI2D accumulates an implicit RIV (a list of indices) into an existing
 * dense array. indices are taken in pairs: the first of each pair is
 * incremented, the second decremented.
 *
 * destination: dense array large enough for every index in locations
 * locations:   valueCount indices into destination
 * valueCount:  number of indices; normally even. a trailing unpaired index
 *              receives +1 instead of reading past the end of the input
 * returns:     destination
 * #TODO fix destination parameter vs calloc of destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){
	size_t i = 0;
	/* whole pairs only: never touch locations[i+1] unless it exists */
	for(; i+1 < valueCount; i += 2){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
	}
	if(i < valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* consolidateI2SIndirect turns an implicit RIV into a sparseRIV by first
 * expanding it to a temporary dense array (mapI2D) and then stripping the
 * zeros (consolidateD2S).
 * returns a sparseRIV whose locations must be freed by the caller; if the
 * temporary dense array cannot be allocated, the result is all-zero
 * (count 0, NULL locations/values)
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
	int *denseTemp = mapI2D(implicit, valueCount);
	if(!denseTemp){
		/* nothing to consolidate: hand back an empty vector rather than
		 * letting consolidateD2S read through a NULL pointer */
		sparseRIV empty;
		memset(&empty, 0, sizeof empty);
		return empty;
	}
	sparseRIV sparseOut = consolidateD2S(denseTemp);
	free(denseTemp);
	return sparseOut;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){ sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut; sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE; int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE; int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0; sparseOut.count = 0;
int add = 1; int add = 1;
int found; int found;
for(int i=0; i<valueCount; i++){ for(int i=0; i<valueCount; i++){
found = 0; found = 0;
for(int j=0; j<sparseOut.count; j++){ for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){ if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add; valuesTemp[i] += add;
add *= -1; add *= -1;
found = 1; found = 1;
} }
} }
if(!found){ if(!found){
locationsTemp[sparseOut.count] = implicit[i]; locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add; valuesTemp[sparseOut.count] = add;
sparseOut.count++; sparseOut.count++;
add*= -1; add*= -1;
} }
} }
sparseOut.locations = malloc(2*sparseOut.count*sizeof(int)); sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count; sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, 2*sparseOut.count*sizeof(int)); memcpy(sparseOut.locations, locationsTemp, 2*sparseOut.count*sizeof(int));
return sparseOut; return sparseOut;
} }
sparseRIV consolidateD2S(int *denseInput){ sparseRIV consolidateD2S(int *denseInput){
sparseRIV output; sparseRIV output;
output.count = 0; output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */ /* key/value pairs will be loaded to a worst-case sized temporary slot */
int* locations = RIVKey.h_tempBlock+RIVSIZE; int* locations = RIVKey.h_tempBlock+RIVSIZE;
int* values = locations+RIVSIZE; int* values = locations+RIVSIZE;
int* locations_slider = locations; int* locations_slider = locations;
int* values_slider = values; int* values_slider = values;
for(int i=0; i<RIVSIZE; i++){ for(int i=0; i<RIVSIZE; i++){
/* act only on non-zeros */ /* act only on non-zeros */
if(denseInput[i]){ if(denseInput[i]){
/* assign index to locations */ /* assign index to locations */
*(locations_slider++) = i; *(locations_slider++) = i;
/* assign value to values */ /* assign value to values */
*(values_slider++) = denseInput[i]; *(values_slider++) = denseInput[i];
/* track size of forming sparseRIV */ /* track size of forming sparseRIV */
output.count++; output.count++;
} }
} }
/* a slot is opened for the locations/values pair */ /* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int)); output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){ if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge printf("memory allocation failed"); //*TODO enable fail point knowledge
} }
/* copy locations values into opened slot */ /* copy locations values into opened slot */
memcpy(output.locations, locations, output.count*sizeof(int)); memcpy(output.locations, locations, output.count*sizeof(int));
output.values = output.locations + output.count; output.values = output.locations + output.count;
/* copy values into opened slot */ /* copy values into opened slot */
memcpy(output.values, values, output.count*sizeof(int)); memcpy(output.values, values, output.count*sizeof(int));
return output; return output;
} }
/* RIVInit prepares the global RIVKey state: it sets I2SThreshold, installs
 * signalSecure as the SIGSEGV handler so the cache is dumped on a crash,
 * allocates the temporary work block and clears the RIV cache.
 * if the work block cannot be allocated a message is written to stderr and
 * tempSize is left at 0.
 */
void RIVInit(void){
	RIVKey.I2SThreshold = sqrt(RIVSIZE);

	/* every field of the sigaction, including the signal mask, must be
	 * defined before it is handed to sigaction() */
	struct sigaction action;
	memset(&action, 0, sizeof action);
	sigemptyset(&action.sa_mask);
	action.sa_sigaction = signalSecure;
	action.sa_flags = SA_SIGINFO;
	/* previously the literal 11, which is SIGSEGV on Linux/BSD/macOS */
	sigaction(SIGSEGV, &action, NULL);

	/* open a slot at least large enough for worst case handling of
	 * sparse to dense conversion. may be enlarged by filetoL2 functions */
	RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
	if(RIVKey.h_tempBlock){
		RIVKey.tempSize = 3*RIVSIZE;
	}else{
		RIVKey.tempSize = 0;
		fputs("RIVInit: could not allocate temporary work block\n", stderr);
	}
	RIVKey.thing = 0;

	/* start with an empty cache of dense RIVs */
	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* RIVCleanup closes down the library: it pushes the cached vectors out to
 * the lexicon (reporting if any push failed) and releases the temporary
 * work block allocated by RIVInit.
 */
void RIVCleanup(void){
	if(cacheDump()){
		puts("cache dump failed, some lexicon data was lost");
	}
	free(RIVKey.h_tempBlock);
	/* leave the global state consistent so a stale pointer is never reused
	 * and a second cleanup is harmless */
	RIVKey.h_tempBlock = NULL;
	RIVKey.tempSize = 0;
}
/* wordtoSeed folds the bytes of a NUL-terminated word into an integer seed.
 * the byte at position i is shifted left by 5*i bits and added in.
 *
 * the arithmetic is done on unsigned values limited to 32 bits, and the
 * shift distance wraps around at 32, so the result is well defined for
 * words of any length (shifting a signed int by its width or more, or
 * overflowing it, is undefined behaviour). words short enough for the
 * original signed arithmetic to be defined produce the same seed as before.
 *
 * word:    NUL-terminated byte string
 * returns: the seed; values above INT_MAX are converted to int in the
 *          implementation-defined (in practice two's complement) way
 */
int wordtoSeed(unsigned char* word){
	unsigned long seed = 0;
	unsigned int shift = 0;
	while(*word){
		seed += ((unsigned long)*word << shift) & 0xFFFFFFFFUL;
		shift = (shift + 5) % 32;
		word++;
	}
	return (int)(seed & 0xFFFFFFFFUL);
}
/* makeSparseLocations appends the NONZEROS pseudo-random locations belonging
 * to a word to an implicit RIV.
 *
 * word:      NUL-terminated word; its seed (wordtoSeed) re-seeds rand(), so
 *            the same word always yields the same locations
 * locations: start of the implicit RIV being built
 * count:     number of entries already present; writing starts at
 *            locations[count]
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
	int *slot = locations+count;
	srand(wordtoSeed(word));
	/* NONZEROS is checked to be even at compile time */
	for(int k=0; k<NONZEROS; k++){
		slot[k] = rand()%RIVSIZE;
	}
}
/* fLexPush writes a denseRIV to the file lexicon/<name> in sparse form.
 * layout: count (size_t), frequency, magnitude (float), then count
 * locations followed by count values (int each).
 *
 * RIVout:  vector to store; once the file has been opened, RIVout.values is
 *          freed by this call whether or not the write succeeds
 * returns: 0 on success, 1 if the file could not be opened (RIVout.values is
 *          then left untouched) or if any write or the close failed
 */
int fLexPush(denseRIV RIVout){
	char pathString[200] = {0};

	/* word data will be placed in a (new?) file under the lexicon directory
	 * in a file named after the word itself */
	snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
	FILE *lexWord = fopen(pathString, "wb");
	if(!lexWord){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	sparseRIV temp = consolidateD2S(RIVout.values);

	int failed = 0;
	failed |= fwrite(&temp.count, sizeof temp.count, 1, lexWord) != 1;
	/* frequency is sized by what it points at; fLexPull reads it back as an int */
	failed |= fwrite(RIVout.frequency, sizeof *RIVout.frequency, 1, lexWord) != 1;
	failed |= fwrite(&RIVout.magnitude, sizeof RIVout.magnitude, 1, lexWord) != 1;
	if(temp.count){
		failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
		failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
	}
	/* buffered data is only known to be written once the close succeeds */
	failed |= fclose(lexWord) != 0;

	free(RIVout.values);
	free(temp.locations);
	return failed;
}
denseRIV fLexPull(FILE* lexWord){ denseRIV fLexPull(FILE* lexWord){
denseRIV output; denseRIV output;
sparseRIV temp; sparseRIV temp;
output.values = calloc( (RIVSIZE+1) ,sizeof(int)); output.values = calloc( (RIVSIZE+1) ,sizeof(int));
output.frequency = output.values+RIVSIZE; output.frequency = output.values+RIVSIZE;
int diagnostic = 0; int diagnostic = 0;
fread(&temp.count, 1, sizeof(size_t), lexWord); fread(&temp.count, 1, sizeof(size_t), lexWord);
diagnostic += fread(&temp.frequency, 1, sizeof(int), lexWord); diagnostic += fread(&temp.frequency, 1, sizeof(int), lexWord);
diagnostic += fread(&(temp.magnitude), 1, sizeof(int), lexWord); diagnostic += fread(&(temp.magnitude), 1, sizeof(int), lexWord);
temp.locations = malloc(temp.count*2*sizeof(int)); temp.locations = malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count; temp.values = temp.locations+temp.count;
diagnostic += fread(temp.locations, temp.count, sizeof(int), lexWord); diagnostic += fread(temp.locations, temp.count, sizeof(int), lexWord);
diagnostic += fread(temp.values, temp.count, sizeof(int), lexWord); diagnostic += fread(temp.values, temp.count, sizeof(int), lexWord);
addS2D(output.values, temp); addS2D(output.values, temp);
*(output.frequency) = temp.frequency; *(output.frequency) = temp.frequency;
output.magnitude = temp.magnitude; output.magnitude = temp.magnitude;
free(temp.locations); free(temp.locations);
output.cached = 0; output.cached = 0;
return output; return output;
} }
/* signalSecure is the handler RIVInit installs: it tries to save the cache
 * to the lexicon, reports the outcome, then restores the default action for
 * the signal and sends it to this process again.
 * NOTE(review): cacheDump and puts are not async-signal-safe; calling them
 * from a handler is a best-effort rescue only.
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
	(void)si;
	(void)arg;
	const char *report = cacheDump()
		? "cache dump failed, some lexicon data lost"
		: "cache dumped successfully";
	puts(report);
	signal(signum, SIG_DFL);
	kill(getpid(), signum);
}
/* cacheDump pushes every cache entry marked as cached out to the lexicon.
 * returns the sum of the fLexPush results, i.e. 0 when every push succeeded
 */
int cacheDump(){
	int failures = 0;
	for(int slot=0; slot<CACHESIZE; slot++){
		if(RIVKey.RIVCache[slot].cached){
			failures += fLexPush(RIVKey.RIVCache[slot]);
		}
	}
	return failures;
}
denseRIV denseAllocate(){ denseRIV denseAllocate(){
/* allocates a 0 vector */ /* allocates a 0 vector */
denseRIV output; denseRIV output;
output.values = calloc(RIVSIZE+1, sizeof(int)); output.values = calloc(RIVSIZE+1, sizeof(int));
/* for compact memory use, frequency is placed immediately after values */ /* for compact memory use, frequency is placed immediately after values */
output.frequency = output.values+RIVSIZE; output.frequency = output.values+RIVSIZE;
output.magnitude = 0; output.magnitude = 0;
output.cached = 0; output.cached = 0;
return output; return output;
} }
/*TODO add a simplified free function*/ /*TODO add a simplified free function*/
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <unistd.h>
#include <math.h> #include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
 * 25000 is the standard, but can be redefined specifically
 */
#ifndef RIVSIZE #ifndef RIVSIZE
#define RIVSIZE 25000 #define RIVSIZE 25000
#endif #endif
/* NONZEROS macro defines the number of non-zero values that will be generated
 * for any level one (barcode) RIV. 2 is simple and lightweight to begin
 */
#ifndef NONZEROS #ifndef NONZEROS
#define NONZEROS 2 #define NONZEROS 2
#endif #endif
/* CACHESIZE macro defines the number of RIVs the system will cache. /* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly * a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems * faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push * that do not use lexpull/push
*/ */
#ifndef CACHESIZE #ifndef CACHESIZE
#define CACHESIZE 20 #define CACHESIZE 20
#endif #endif
#define CACHED 0x02 #define CACHED 0x02
#define SPARSE 0x01 #define SPARSE 0x01
#define AVAILABLE 0x04 #define AVAILABLE 0x04
/* common layout behind both the sparseRIV and denseRIV typedefs */
typedef struct {
	/* identifier for the vector; presumably the word it belongs to -- TODO confirm */
	char name[100];
	int *values;
	/* presumably only meaningful for the sparse form -- TODO confirm */
	int *locations;
	size_t count;
	unsigned int *frequency;
	float magnitude;
	int cached;
	/* NOTE(review): purpose not visible in this file */
	int boolean;
	/* presumably a combination of the CACHED/SPARSE/AVAILABLE bits -- TODO confirm */
	int flags;
} RIV;
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
 * as this is often an ideal case, it is advisable as the default
 * unless we are doing long term RIV aggregation.
 * specifically, a sparseRIV contains a pair of arrays,
 * containing locations and values, where pairs are found in like array
 * indices.
 */
typedef RIV sparseRIV; typedef RIV sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector * this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally * math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic) * performed between sparse and dense (hetero-arithmetic)
*/ */
typedef RIV denseRIV; typedef RIV denseRIV;
/* RIVKey holds globally important data that should not be changed partway
 * through a run. the first function call in the program should always be:
 * RIVInit();
 * this will set these variables, check for incompatible choices, and open up
 * memory blocks which the system will use in the background
 */
/* single file-scope instance of the library state */
static struct RIVData {
	/* presumably run-time copies of RIVSIZE and NONZEROS -- TODO confirm */
	size_t RIVsize;
	int nonZeros;
	int I2SThreshold;
	/* scratch buffer and its size; presumably counted in ints -- TODO confirm */
	int *h_tempBlock;
	int tempSize;
	/* NOTE(review): purpose not visible in this file */
	int thing;
	/* cache of dense RIVs and its capacity; allocation is not visible here */
	denseRIV *RIVCache;
	int cacheSize;
} RIVKey;
/* RIVinit should be the first function called in any usage of this library /* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference, * it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for * it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations. * the functions to use, so that we can move fast with rare allocations.
*/ */
void RIVInit(); void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees /* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files * blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/ */
void RIVCleanup(); void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned * all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't * to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata * need to carry metadata
*/ */
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations /* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs * based on location-value pairs
*/ */
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation. function is called by "lexpush",
 * which is what users should actually use. lexPush, unlike fLexPush,
 * has cache logic under the hood for speed and harddrive optimization
 */
int fLexPush(denseRIV RIVout);
/* fLexPull reads one vector from an already opened lexicon file */
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
 * arranged by chronological order of generation (as per makeSparseLocations)
 * it assigns, in the process of mapping, values according to ordering:
 * +1 for the first index of each pair, -1 for the second.
 * the returned array is allocated here and must be freed by the caller
 */
int* mapI2D(int *locations, size_t valueCount);
/* both build a sparseRIV from an implicit RIV */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump(void);
/* same mapping as mapI2D, accumulated into an existing dense array */
int* addI2D(int* destination, int* locations, size_t valueCount);
denseRIV denseAllocate(void);
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* begin definitions */ /* begin definitions */
/* addS2D adds every location/value pair of a sparseRIV into an existing
 * dense array and returns that same array.
 * #TODO fix destination parameter vs calloc of destination
 */
int* addS2D(int* destination, sparseRIV input){
	for(size_t pair = 0; pair < input.count; pair++){
		/* the stored location is the dense index the value belongs to */
		destination[input.locations[pair]] += input.values[pair];
	}
	return destination;
}
/* mapI2D expands an implicit RIV into a newly allocated dense vector.
 * locations: index values, read in pairs
 * valueCount: number of entries in locations
 * returns a zero-initialised array of RIVSIZE ints with the +1/-1 pattern
 * applied by addI2D, or NULL when the allocation fails.
 * the returned array belongs to the caller and must be released with free
 */
int* mapI2D(int *locations, size_t valueCount){
    int *destination = calloc(RIVSIZE, sizeof *destination);
    if(!destination){
        /* nothing to write into; let the caller decide how to react */
        return NULL;
    }
    /* the +1/-1 mapping is shared with addI2D rather than repeated here */
    return addI2D(destination, locations, valueCount);
}
/* addI2D accumulates an implicit RIV into an existing dense vector.
 * destination: dense array, modified in place
 * locations: index values, read in pairs
 * valueCount: number of entries in locations
 * the first index of each pair receives +1, the second -1. when
 * valueCount is odd the unpaired final index receives +1, instead of the
 * loop reading one slot past the end of locations.
 * indexes are not bounds checked against destination.
 * returns destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){
    size_t i = 0;

    /* complete pairs only: i+1 must still be inside locations */
    for(; i + 1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i + 1]] -= 1;
    }
    /* leftover entry of an odd-length input */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* consolidateI2SIndirect converts an implicit RIV to a sparseRIV by way
 * of a temporary dense vector: mapI2D expands it, consolidateD2S packs
 * the non-zero entries, and the temporary is released before returning
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
    int *expanded = mapI2D(implicit, valueCount);
    /* the SPARSE flag is set inside consolidateD2S */
    sparseRIV packed = consolidateD2S(expanded);

    free(expanded);
    return packed;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){ sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut; sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE; int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE; int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0; sparseOut.count = 0;
int add = 1; int add = 1;
int found; int found;
for(int i=0; i<valueCount; i++){ for(int i=0; i<valueCount; i++){
found = 0; found = 0;
for(int j=0; j<sparseOut.count; j++){ for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){ if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add; valuesTemp[i] += add;
add *= -1; add *= -1;
found = 1; found = 1;
} }
} }
if(!found){ if(!found){
locationsTemp[sparseOut.count] = implicit[i]; locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add; valuesTemp[sparseOut.count] = add;
sparseOut.count++; sparseOut.count++;
add*= -1; add*= -1;
} }
} }
sparseOut.locations = malloc(2*sparseOut.count*sizeof(int)); sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count; sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, 2*sparseOut.count*sizeof(int)); memcpy(sparseOut.locations, locationsTemp, 2*sparseOut.count*sizeof(int));
sparseOut.flags |= SPARSE; sparseOut.flags |= SPARSE;
return sparseOut; return sparseOut;
} }
sparseRIV consolidateD2S(int *denseInput){ sparseRIV consolidateD2S(int *denseInput){
sparseRIV output; sparseRIV output;
output.count = 0; output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */ /* key/value pairs will be loaded to a worst-case sized temporary slot */
int* locations = RIVKey.h_tempBlock+RIVSIZE; int* locations = RIVKey.h_tempBlock+RIVSIZE;
int* values = locations+RIVSIZE; int* values = locations+RIVSIZE;
int* locations_slider = locations; int* locations_slider = locations;
int* values_slider = values; int* values_slider = values;
for(int i=0; i<RIVSIZE; i++){ for(int i=0; i<RIVSIZE; i++){
/* act only on non-zeros */ /* act only on non-zeros */
if(denseInput[i]){ if(denseInput[i]){
/* assign index to locations */ /* assign index to locations */
*(locations_slider++) = i; *(locations_slider++) = i;
/* assign value to values */ /* assign value to values */
*(values_slider++) = denseInput[i]; *(values_slider++) = denseInput[i];
/* track size of forming sparseRIV */ /* track size of forming sparseRIV */
output.count++; output.count++;
} }
} }
/* a slot is opened for the locations/values pair */ /* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int)); output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){ if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge printf("memory allocation failed"); //*TODO enable fail point knowledge
} }
/* copy locations values into opened slot */ /* copy locations values into opened slot */
memcpy(output.locations, locations, output.count*sizeof(int)); memcpy(output.locations, locations, output.count*sizeof(int));
output.values = output.locations + output.count; output.values = output.locations + output.count;
/* copy values into opened slot */ /* copy values into opened slot */
memcpy(output.values, values, output.count*sizeof(int)); memcpy(output.values, values, output.count*sizeof(int));
output.flags |= SPARSE; output.flags |= SPARSE;
return output; return output;
} }
/* RIVInit prepares the global RIVKey state and must run before the other
 * RIV functions:
 *  - I2SThreshold is set to sqrt(RIVSIZE)
 *  - h_tempBlock, a scratch area of 3*RIVSIZE ints, is allocated
 *    (large enough for worst case sparse to dense conversion; may be
 *    enlarged by filetoL2 functions)
 *  - RIVCache, a zeroed array of CACHESIZE denseRIVs, is allocated
 *  - signalSecure is installed as the SIGSEGV handler
 * the process exits with EXIT_FAILURE if either allocation fails
 */
void RIVInit(){
    RIVKey.I2SThreshold = sqrt(RIVSIZE);

    RIVKey.h_tempBlock = (int*)malloc(3*RIVSIZE*sizeof(int));
    if(!RIVKey.h_tempBlock){
        fputs("RIVInit: memory allocation failed\n", stderr);
        exit(EXIT_FAILURE);
    }
    RIVKey.tempSize = 3*RIVSIZE;
    RIVKey.thing = 0;
    RIVKey.cacheSize = CACHESIZE;

    /* open a slot for a cache of dense RIVs, optimized for frequent accesses.
     * calloc may legitimately return NULL for a zero-sized cache */
    RIVKey.RIVCache = (denseRIV*)calloc(RIVKey.cacheSize,sizeof(denseRIV));
    if(!RIVKey.RIVCache && RIVKey.cacheSize){
        fputs("RIVInit: memory allocation failed\n", stderr);
        exit(EXIT_FAILURE);
    }

    /* the handler walks the cache, so it is installed only once the cache
     * exists. the struct is cleared and sa_mask emptied so that sigaction
     * never reads indeterminate fields */
    struct sigaction action;
    memset(&action, 0, sizeof action);
    sigemptyset(&action.sa_mask);
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV,&action,NULL);
}
/* RIVCleanup flushes the cache to the lexicon (cacheDump) and releases
 * the memory RIVInit allocated. a failed dump is reported on stdout.
 * the RIVKey pointers and sizes are reset afterwards so that a later
 * cacheDump (for instance from the signal handler) finds an empty cache
 * instead of freed memory
 */
void RIVCleanup(){
    if(cacheDump()){
        puts("cache dump failed, some lexicon data was lost");
    }
    /* RIVInit allocates the cache even when CACHESIZE is 0, and free
     * accepts NULL, so no preprocessor guard is needed */
    free(RIVKey.RIVCache);
    RIVKey.RIVCache = NULL;
    RIVKey.cacheSize = 0;

    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed folds the characters of a NUL-terminated word into an int
 * seed: character number i is shifted left by 5*i bits and added in.
 * the arithmetic is done in unsigned int and the shift count is reduced
 * modulo 32, so that words of 7 or more characters no longer shift by
 * the full width of the type (undefined behaviour in C). the reduction
 * mirrors what a 32-bit hardware shift on x86 does with an oversized
 * count, assuming a 32-bit int; seeds of shorter words are unchanged
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;

    for(; *word; word++){
        /* left-shift 5 more each time *should* make seeds unique to words */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
    }
    return (int)seed;
}
/* makeSparseLocations writes the NONZEROS pseudo-random indexes of a
 * word into locations, starting at offset count.
 * the generator is seeded from wordtoSeed(word), so one word always
 * yields the same sequence; every index is rand() reduced modulo RIVSIZE
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *slot = locations + count;

    srand(wordtoSeed(word));
    for(int filled = 0; filled < NONZEROS; filled++){
        slot[filled] = rand()%RIVSIZE;
    }
}
/* fLexPush writes a denseRIV to the file "lexicon/<name>".
 * layout: frequency, magnitude, then RIVSIZE values, each written as
 * sizeof(int) bytes (the layout fLexPull reads back).
 * returns 0 on success, in which case RIVout.values has been freed.
 * returns 1 when the path does not fit, the file cannot be opened, or a
 * write/close fails; RIVout.values is then left allocated for the caller
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    int failed = 0;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    /* fclose flushes, so a full disk can surface here */
    failed |= fclose(lexWord) != 0;
    if(failed){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    free(RIVout.values);
    return 0;
}
denseRIV fLexPull(FILE* lexWord){ denseRIV fLexPull(FILE* lexWord){
denseRIV output; denseRIV output;
output.values = malloc( (RIVSIZE+1) *sizeof(int)); output.values = malloc( (RIVSIZE+1) *sizeof(int));
output.frequency = (unsigned int*)(output.values+RIVSIZE); output.frequency = (unsigned int*)(output.values+RIVSIZE);
int diagnostic = 0; int diagnostic = 0;
diagnostic += fread(output.frequency, 1, sizeof(int), lexWord); diagnostic += fread(output.frequency, 1, sizeof(int), lexWord);
diagnostic += fread(&(output.magnitude), 1, sizeof(int), lexWord); diagnostic += fread(&(output.magnitude), 1, sizeof(int), lexWord);
diagnostic += fread(output.values, RIVSIZE, sizeof(int), lexWord); diagnostic += fread(output.values, RIVSIZE, sizeof(int), lexWord);
if(diagnostic != (RIVSIZE+2)){ if(diagnostic != (RIVSIZE+2)){
output.magnitude = -1; output.magnitude = -1;
} }
output.cached = 0; output.cached = 0;
return output; output.flags &= ~SPARSE;
return output;
}
}
/* signalSecure is the handler RIVInit installs through sigaction.
 * it dumps the cache, reports the outcome on stdout, restores the
 * default action for the signal and sends the signal to this process
 * again. si and arg are not used
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    (void)si;
    (void)arg;

    int dumpFailed = cacheDump();
    puts(dumpFailed ? "cache dump failed, some lexicon data lost"
                    : "cache dumped successfully");

    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
/* cacheDump walks the RIVKey.cacheSize slots of RIVKey.RIVCache and
 * hands every slot whose cached field is set to fLexPush.
 * returns the sum of the fLexPush results, so 0 means no push failed.
 * the cached field of a pushed slot is left as it was
 */
int cacheDump(){
    int failures = 0;
    denseRIV *const cacheEnd = RIVKey.RIVCache+RIVKey.cacheSize;

    for(denseRIV *entry = RIVKey.RIVCache; entry < cacheEnd; entry++){
        if(entry->cached){
            failures += fLexPush(*entry);
        }
    }
    return failures;
}
denseRIV denseAllocate(){
/* allocates a 0 vector */ denseRIV denseAllocate(){
denseRIV output; /* allocates a 0 vector */
output.values = calloc(RIVSIZE+1, sizeof(int)); denseRIV output;
/* for compact memory use, frequency is placed immediately after values */ output.values = calloc(RIVSIZE+1, sizeof(int));
output.frequency = (unsigned int*)(output.values+RIVSIZE); /* for compact memory use, frequency is placed immediately after values */
output.magnitude = 0; output.frequency = (unsigned int*)(output.values+RIVSIZE);
output.cached = 0; output.magnitude = 0;
return output; output.cached = 0;
} output.flags &= ~SPARSE;
return output;
/*TODO add a simplified free function*/ }
/*TODO add a simplified free function*/
No preview for this file type
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#define RIVSIZE 5 #define RIVSIZE 5
#define CACHESIZE 0 #define CACHESIZE 0
#define THRESHOLD 0.70 #define THRESHOLD 0.70
#include "RIVtoolsCPUlinux.h" #include "RIVtoolsCPUlinux.h"
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount); void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount); void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* entry point of the centroid experiment.
 * argv[1] names a directory; every file below it becomes an L2 sparseRIV
 * (directoryToL2s), five fixed seed words become the starting centroids,
 * and getcentroids assigns the file vectors to them. timings are printed.
 * returns 1 when no directory is given or its path does not fit
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();

    /* validate the arguments before anything is allocated */
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    char rootString[2000];
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path is too long\n");
        return 1;
    }

    int fileCount = 0;
    RIVInit();
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));

    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);

    getMagnitudes(fileRIVs, fileCount);

    clock_t beginnsquared = clock();

    sparseRIV centroids[5];
    strcpy(centroids[0].name, "boobs");
    strcpy(centroids[1].name, "ass");
    strcpy(centroids[2].name, "shit");
    strcpy(centroids[3].name, "cocks");
    strcpy(centroids[4].name, "fuck");
    for(int i=0; i<5; i++){
        /* the name is read by wordtoL2 before the slot is overwritten */
        centroids[i] = wordtoL2(centroids[i].name);
    }
    getMagnitudes(centroids, 5);
    getcentroids(centroids, fileRIVs, 5, fileCount);

    clock_t endnsquared = clock();
    double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("nsquared time:%lf\n\n", time);
    printf("%d <", RIVKey.thing);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);

    free(fileRIVs);
    return 0;
}
/* directoryToL2s walks rootString recursively and appends one sparseRIV
 * per regular entry to the growing array *fileRIVs.
 * rootString: directory path ending in '/'
 * fileRIVs: address of the array; it is reallocated as entries are added
 * fileCount: number of entries currently in the array, updated in place
 * entries whose name starts with '.' are skipped. sub-directories are
 * descended into and are not themselves opened as files. the walk of
 * the current directory stops at the first file that cannot be opened or
 * when the array cannot grow. the directory handle is always closed.
 * NOTE(review): the d_type test does not recognise directories on file
 * systems that report DT_UNKNOWN
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }

    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;

        int isDirectory = (files->d_type == DT_DIR);
        int pathLength = snprintf(pathString, sizeof pathString, "%s%s%s",
                                  rootString, files->d_name, isDirectory ? "/" : "");
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* a directory has no text of its own to read */
            continue;
        }

        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }

        /* grow through a temporary so the old array survives a failure */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2Clean(input);
        /* NOTE(review): name's capacity is not visible here; a long path
         * may not fit - confirm against the sparseRIV declaration */
        strcpy(grown[(*fileCount)].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
/* getcentroids assigns every vector of vectorSet to one centroid and
 * sums the members of each centroid into a dense vector, printing as it
 * goes.
 * centroids/centroidCount: the candidate centroids
 * vectorSet/vectorCount: the vectors to distribute
 * a vector goes to the centroid with the smallest absolute cosine as
 * returned by cosineCompare.
 * NOTE(review): picking the SMALLEST |cosine| is kept from the original;
 * confirm this is the intended notion of "closest".
 * NOTE(review): the arrays returned by cosineCompare are not freed here
 * because their ownership is not visible from this file
 */
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount){
    if(centroidCount <= 0 || vectorCount < 0){
        return;
    }

    float** cosines = malloc(centroidCount*sizeof *cosines);
    /* one pool per purpose, sliced per centroid below; both start zeroed */
    int* memberPool = calloc((size_t)vectorCount*centroidCount, sizeof(int));
    int* densePool = calloc((size_t)RIVKey.RIVsize*centroidCount, sizeof(int));
    if(!cosines || (!memberPool && vectorCount > 0) || (!densePool && RIVKey.RIVsize > 0)){
        printf("memory allocation failed");
        free(cosines);
        free(memberPool);
        free(densePool);
        return;
    }

    for(int i=0; i<centroidCount; i++){
        cosines[i] = cosineCompare(centroids[i], vectorSet, vectorCount);
    }

    int* centroidIndexes[centroidCount];
    int indexCounts[centroidCount];
    int* denses[centroidCount];
    for(int i=0; i<centroidCount; i++){
        centroidIndexes[i] = memberPool+i*vectorCount;
        denses[i] = densePool+i*RIVKey.RIVsize;
        /* member counts must start at zero before they are used as indexes */
        indexCounts[i] = 0;
    }

    for(int i=0; i<vectorCount; i++){
        float token = 2.0;
        int counter = 0;
        printf("\nfor vector %d:\n", i);
        for(int j = 0; j<centroidCount; j++){
            printf("centroid %d: %f", j, cosines[j][i]);
            if(fabsf(cosines[j][i])< token){
                token = fabsf(cosines[j][i]);
                counter = j;
            }
        }
        centroidIndexes[counter][indexCounts[counter]] = i;
        indexCounts[counter] += 1;
    }

    for(int i=0; i<centroidCount; i++){
        printf("\n\nnumber %d\n", i);
        for(int j=0; j<indexCounts[i]; j++){
            /* add the vector that was recorded as member j of centroid i */
            addS2D(denses[i], vectorSet[centroidIndexes[i][j]]);
            for(int k=0; k<RIVKey.RIVsize; k++){
                printf("%d, ", denses[i][k]);
            }
        }
    }

    free(memberPool);
    free(densePool);
    free(cosines);
}
No preview for this file type
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#define RIVSIZE 5000 #define RIVSIZE 5000
#define CACHESIZE 0 #define CACHESIZE 0
#define NONZEROS 2 #define NONZEROS 2
#define THRESHOLD 0.7 #define THRESHOLD 0.7
#define COSINEACTION do {\ #define COSINEACTION do {\
if(cosine > THRESHOLD){ \ if(cosine > THRESHOLD){ \
printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\ printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\
(*multipliers).boolean = 0; \ (*multipliers).boolean = 0; \
RIVKey.thing++; \ RIVKey.thing++; \
}\ }\
}while(0) }while(0)
#include "RIVtoolsMorphic.h" #include "RIVtoolsMorphic.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount); void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock(); clock_t begintotal = clock();
int fileCount = 0; int fileCount = 0;
RIVInit(); RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV)); sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000]; char rootString[2000];
if(argc <2){ if(argc <2){
printf("give me a directory"); printf("give me a directory");
return 1; return 1;
} }
strcpy(rootString, argv[1]); strcpy(rootString, argv[1]);
strcat(rootString, "/"); strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount); directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount); printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs; sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount; sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){ while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider); (*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++; fileRIVs_slider++;
} }
clock_t beginnsquared = clock(); clock_t beginnsquared = clock();
float cosine; float cosine;
float minmag; float minmag;
float maxmag; float maxmag;
denseRIV baseDense; denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int)); baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs; fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider; sparseRIV* comparators_slider;
while(fileRIVs_slider<fileRIVs_stop){ while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs; comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int)); memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider); baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude; baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85; minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15; maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){ while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){ if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider); cosine = cosCompare(baseDense, *comparators_slider);
if(cosine>THRESHOLD){ if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine); printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0; (*comparators_slider).boolean = 0;
RIVKey.thing++; RIVKey.thing++;
} }
} }
comparators_slider++; comparators_slider++;
//cosineCompare(fileRIVs[i], fileRIVs, i); //cosineCompare(fileRIVs[i], fileRIVs, i);
} }
fileRIVs_slider++; fileRIVs_slider++;
} }
clock_t endnsquared = clock(); clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC; double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time); printf("nsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing); printf("%d <", RIVKey.thing);
clock_t endtotal = clock(); clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent); printf("total time:%lf\n\n", time_spent);
free(fileRIVs); free(fileRIVs);
return 0; return 0;
} }
/* directoryToL2s walks rootString recursively and appends one sparseRIV
 * (built with fileToL2) per regular entry to the growing array *fileRIVs.
 * rootString: directory path ending in '/'
 * fileRIVs: address of the array; it is reallocated as entries are added
 * fileCount: number of entries currently in the array, updated in place
 * entries whose name starts with '.' are skipped. sub-directories are
 * descended into and are not themselves opened as files. the walk of
 * the current directory stops at the first file that cannot be opened or
 * when the array cannot grow. the directory handle is always closed.
 * NOTE(review): the d_type test does not recognise directories on file
 * systems that report DT_UNKNOWN
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }

    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;

        int isDirectory = (files->d_type == DT_DIR);
        int pathLength = snprintf(pathString, sizeof pathString, "%s%s%s",
                                  rootString, files->d_name, isDirectory ? "/" : "");
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* a directory has no text of its own to read */
            continue;
        }

        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }

        /* grow through a temporary so the old array survives a failure */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2(input);
        /* NOTE(review): name's capacity is not visible here; a long path
         * may not fit - confirm against the sparseRIV declaration */
        strcpy(grown[(*fileCount)].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
No preview for this file type
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#define CACHESIZE 100000 #define CACHESIZE 100000
#include "RIVtoolsCPUlinux.h" #include "RIVtoolsCPUlinux.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount); int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
/* entry point of the lexicon builder.
 * argv[1] names a directory whose files are ground into the lexicon by
 * directoryGrind; the total run time is printed and the RIV state is
 * released with RIVCleanup.
 * returns 1 when no directory is given or its path does not fit
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();

    /* argv[1] must exist before it is read */
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("directory path is too long\n");
        return 1;
    }

    RIVInit();
    directoryGrind(pathString);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    RIVCleanup();
    return 0;
}
/* addS2Ds adds one sparseRIV to each of RIVCount dense RIVs.
 * for every location/value pair of additive, the value is added at that
 * location in the values array of each member of denseSet.
 * locations are not bounds checked
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    int *const lastLocation = additive.locations+additive.count;
    int *amount = additive.values;

    for(int *where = additive.locations; where < lastLocation; where++, amount++){
        for(int target = 0; target < RIVCount; target++){
            denseSet[target].values[*where] += *amount;
        }
    }
}
/* checkDupe reports whether word already names one of the first
 * wordCount entries of RIVSet (exact strcmp match).
 * returns 1 when found, 0 otherwise
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int i = 0; i < wordCount; i++){
        if(strcmp(word, RIVSet[i].name) == 0){
            return 1;
        }
    }
    return 0;
}
/* directoryGrind walks rootString recursively and runs fileGrind on
 * every entry that can be opened with mode "r+".
 * rootString: directory path ending in '/'
 * "." and ".." are skipped. sub-directories are descended into and are
 * not themselves opened. the path of every file attempted is printed.
 * entries that cannot be opened are silently passed over. the directory
 * handle is always closed.
 * NOTE(review): the d_type test does not recognise directories on file
 * systems that report DT_UNKNOWN
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }

    struct dirent *files;
    while((files=readdir(directory))){
        /* skipping with continue lets the loop condition catch the end of
         * the directory, even when "." or ".." is the last entry */
        if(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){
            continue;
        }

        int isDirectory = (files->d_type == DT_DIR);
        int pathLength = snprintf(pathString, sizeof pathString, "%s%s%s",
                                  rootString, files->d_name, isDirectory ? "/" : "");
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        if(isDirectory){
            directoryGrind(pathString);
            continue;
        }

        printf("%s\n", pathString);
        FILE *input = fopen(pathString, "r+");
        if(input){
            fileGrind(input);
            fclose(input);
        }
    }
    closedir(directory);
}
/* fileGrind folds the contents of one text file into the lexicon.
 *
 * The file is first reduced to its aggregate (L2) vector by fileToL2Clean,
 * then rewound and read again word by word. Each distinct clean word is
 * pulled with lexPull, has its frequency counter incremented, receives the
 * aggregate vector through addS2Ds, and is handed back with lexPush.
 *
 * textFile: stream opened by the caller; it is rewound here, never closed.
 */
void fileGrind(FILE* textFile){
    sparseRIV aggregateRIV = fileToL2Clean(textFile);
    fseek(textFile, 0, SEEK_SET);

    /* fileToL2Clean counts every clean word in frequency, so that count is
     * an upper bound on the number of distinct clean words stored below */
    int capacity = aggregateRIV.frequency;
    denseRIV *RIVArray = NULL;
    if(capacity > 0){
        RIVArray = (denseRIV*)malloc(capacity*sizeof(denseRIV));
    }
    if(!RIVArray){
        /* nothing to aggregate (or out of memory): release and leave */
        free(aggregateRIV.locations);
        return;
    }

    int wordCount = 0;
    char word[200];
    /* compare against 1: fscanf yields EOF (non-zero) on a read error, which
     * the old truthiness test treated as "keep going" */
    while(wordCount < capacity && fscanf(textFile, "%99s", word) == 1){
        /* a token that touches end-of-file is dropped, exactly as
         * fileToL2Clean drops it, so both passes see the same words */
        if(feof(textFile)) break;
        if(!(*word)) continue;
        if(!isWordClean((char*)word)){
            continue;
        }
        /* each distinct word is pulled only once per file */
        if(checkDupe(RIVArray, word, wordCount)){
            continue;
        }
        RIVArray[wordCount] = lexPull(word);
        if(!*((RIVArray[wordCount].name))) break;

        /* frequency is a pointer inside the dense RIV; bump it in place */
        *(RIVArray[wordCount].frequency) += 1;
        wordCount++;
    }

    addS2Ds(RIVArray, aggregateRIV, wordCount);

    for(int i = 0; i < wordCount; i++){
        lexPush(RIVArray[i]);
    }

    free(RIVArray);
    free(aggregateRIV.locations);
    /* NOTE(review): aggregateRIV.values was deliberately not freed in the
     * original; confirm whether it shares the locations allocation */
}
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage */ /* RIV stands for Random Index Vector, referring to the method of generating
int lexPush(denseRIV RIVout); * the basic vectors that correspond to each word. each word has an algorithmically
/* lexPull reads an existing lexicon entry (under directory "lexicon") * generated vector which represents it in this mathematical model, such that a word
* and creates a denseRIV with those attributes. * will produce the same vector each time it is encountered*[1]. this base
* if the file does not exist, it creates a 0 vector with the name of word * vector will be referred to as a L1 vector or a barcode vector
*/ *
denseRIV lexPull(char* word); * by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n") * or aggregate vector. in its simplest implementation, an L2 vector
* and returns a sparse RIV which is the vector sum of the base RIVs of each * representation of a document contains a model of the contents of the
* word contained * document, enabling us to compare direction and magnitude of document
*/ * vectors to understand their relationships to each other.
sparseRIV fileToL2(FILE *input); *
/* fileToL2Clean operates the same as fileToL2 but keeps only words * but the system we are really interested in is the ability to form
* containing lowercase letters and the '_' symbol * context vectors
* this is important if you will be lexPush-ing those words later * a context vector is the sum of all (L1?) vectors that the word
*/ * has been encountered in context with. from these context vectors
sparseRIV fileToL2Clean(FILE *data); * certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
sparseRIV fileToL2direct(FILE *data); *
/*cosine determines the "similarity" between two RIVs. */ * [1] a word produces the same vector each time it is encountered only
float cosCompare(denseRIV baseRIV, sparseRIV comparator); * if the environment is the same, ie. RIVs are the same dimensionality
* nonzero count is the same. comparing vectors produced in different
sparseRIV wordtoL2(char* word); * environments yields meaningless drivel and should be avoided
*
sparseRIV consolidateI2S(int *implicit, size_t valueCount); * [2] what exactly "context" means remains a major stumbling point.
sparseRIV text2L2(char *text); * paragraphs? sentences? some potential analyses would expect a static
sparseRIV text2L2(char *text){ * sized context (the nearest 10 words?) in order to be sensible, but
unsigned int blockSize; * it may be that some other definition of context is the most valid for
char word[100] = {0}; * this model. we will have to find out.
*
/* locations (implicit RIV) are temp stored in temp block, and moved * some notes:
* to permanent home in consolidation */ *
int *locations = RIVKey.h_tempBlock; * -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
int locationCount = 0; * the two primary data structures we will use to analyze these vectors
int displacement; * each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
while(sscanf(text, "%99s%n", word, &displacement)){ *
text += displacement+1; * -denseRIV is a standard vector representation.
if(!displacement){ * each array index corresponds to a dimension
break; * each value corresponds to a measurement in that dimension
} *
* -sparseRIV is vector representation optimized for largely empty vectors
if(!(*word)){ * each data point is a location/value pair where the
break; * location represents array index
} * value represents value in that array index
*
blockSize = locationCount+NONZEROS; * if we have a sparsely populated dense vector (mostly 0s) such as:
/* if this word would overflow the locations block, grow it */ *
if(blockSize>RIVKey.tempSize){ * |0|0|5|0|0|0|0|0|4|0|
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); *
locations = RIVKey.h_tempBlock; * there are only 2 values in a ten element array. this could, instead
RIVKey.tempSize+=NONZEROS; * be represented as
} *
* |2|8| array indexes
/* add word's L1 RIV to the accumulating implicit RIV */ * |5|4| array values
makeSparseLocations((unsigned char*)word, locations, locationCount); * |2| record of size
locationCount+= NONZEROS; *
* and so, a 10 element vector has been represented in only 5 integers
} *
sparseRIV output = consolidateI2S(locations, locationCount); * this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
/* frequency records the number of words in this file */ *
output.frequency = locationCount/NONZEROS; * |0|0|5|0|0|0|0|0|4|0|
output.boolean = 1; * |0|0|0|0|0|0|7|0|3|-2|
return output; * and we wish to perform the dot product this will take 10 steps,
} * 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
sparseRIV fileToL2(FILE *data){ * |2|8|
unsigned int blockSize; * |5|4|
unsigned char word[100] = {0}; * |2|
*
/* locations (implicit RIV) are temp stored in temp block, and moved * |6|8|9|
* to permanent home in consolidation */ * |7|3|-2|
int *locations = RIVKey.h_tempBlock; * |3|
int locationCount = 0; *
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
while(fscanf(data, "%99s", word)){ * |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
if(feof(data)){ * / / /
break; * |6|8|9|
} * |7|3|-2|
if(!(*word)){ * |3|
break; * we can simply access the dense vector by indexes held in the sparse vector
} * reducing this operation to only 3 steps
blockSize = locationCount+NONZEROS;
/* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){ /* lexPush writes a denseRIV to a file for permanent storage */
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); int lexPush(denseRIV RIVout);
locations = RIVKey.h_tempBlock; /* lexPull reads an existing lexicon entry (under directory "lexicon")
RIVKey.tempSize+=NONZEROS; * and creates a denseRIV with those attributes.
} * if the file does not exist, it creates a 0 vector with the name of word
*/
/* add word's L1 RIV to the accumulating implicit RIV */ denseRIV lexPull(char* word);
makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; /* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
} * word contained
*/
sparseRIV output = consolidateI2S(locations, locationCount); sparseRIV fileToL2(FILE *input);
/* frequency records the number of words in this file */ /* fileToL2Clean operates the same as fileToL2 but keeps only words
output.frequency = locationCount/NONZEROS; * containing lowercase letters and the '_' symbol
output.boolean = 1; * this is important if you will be lexPush-ing those words later
*/
return output; sparseRIV fileToL2Clean(FILE *data);
}
/*filetoL2direct is an experiment in simplifying the process. it's slow */
sparseRIV fileToL2Clean(FILE *data){ sparseRIV fileToL2direct(FILE *data);
/*cosine determines the "similarity" between two RIVs. */
unsigned char word[100] = {0}; float cosCompare(denseRIV baseRIV, sparseRIV comparator);
int *locations = RIVKey.h_tempBlock;
unsigned int blockSize; /*currently unused */
sparseRIV wordtoL2(char* word);
int locationCount = 0;
/* converts an implicit RIV (a set of unvalued locations) into a formal
while(fscanf(data, "%99s", word)){ * sparse RIV. this chooses the best method to perform the consolidation
* and launches that function */
if(feof(data)){ sparseRIV consolidateI2S(int *implicit, size_t valueCount);
break;
} /* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
if(!(*word)){ sparseRIV text2L2(char *text){
break; unsigned int blockSize;
} char word[100] = {0};
/* if the word is not clean, skip it */
if(!isWordClean((char*)word)){ /* locations (implicit RIV) are temp stored in temp block, and moved
continue; * to permanent home in consolidation */
} int *locations = RIVKey.h_tempBlock;
blockSize = locationCount+NONZEROS; int locationCount = 0;
if(blockSize>RIVKey.tempSize){ int displacement;
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock; while(sscanf(text, "%99s%n", word, &displacement)){
RIVKey.tempSize+=NONZEROS; text += displacement+1;
} if(!displacement){
break;
makeSparseLocations(word, locations, locationCount); }
locationCount+= NONZEROS;
if(!(*word)){
} break;
}
sparseRIV output = consolidateI2S(locations, locationCount);
blockSize = locationCount+NONZEROS;
/* frequency records the number of words in this file */ /* if this word would overflow the locations block, grow it */
output.frequency = locationCount/NONZEROS; if(blockSize>RIVKey.tempSize){
output.boolean = 1; RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
return output; locations = RIVKey.h_tempBlock;
} RIVKey.tempSize+=NONZEROS;
}
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount>RIVKey.I2SThreshold){ /* add word's L1 RIV to the accumulating implicit RIV */
return consolidateI2SIndirect(implicit, valueCount); makeSparseLocations((unsigned char*)word, locations, locationCount);
}else{ locationCount+= NONZEROS;
return consolidateI2SDirect(implicit, valueCount);
} }
sparseRIV output = consolidateI2S(locations, locationCount);
}
void aggregateWord2D(denseRIV destination, char* word){ /* frequency records the number of words in this file, untill frequency
* is needed to hold some more useful data point */
//makeSparseLocations((unsigned char*)word, locationSlot, 0); output.frequency = locationCount/NONZEROS;
srand(wordtoSeed((unsigned char*)word)); output.boolean = 1;
for(int i=0; i<NONZEROS; i++){ return output;
}
destination.values[(rand()%RIVSIZE)] +=1;
destination.values[(rand()%RIVSIZE)] -= 1; sparseRIV fileToL2(FILE *data){
} unsigned int blockSize;
} unsigned char word[100] = {0};
float cosCompare(denseRIV baseRIV, sparseRIV comparator){ /* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */
int dot = 0; int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
int *values = comparator.values;
int *locations = comparator.locations;
int *locations_Stop = locations+comparator.count; while(fscanf(data, "%99s", word)){
while(locations<locations_Stop){ if(feof(data)){
/* we calculate the dot-product to derive the cosine */ break;
dot += (*values)*(*(baseRIV.values+(*locations))); }
locations++; if(!(*word)){
values++; break;
} }
float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
blockSize = locationCount+NONZEROS;
return cosine; /* if this word would overflow the locations block, grow it */
} if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
float getMagnitudeSparse(sparseRIV input){ locations = RIVKey.h_tempBlock;
unsigned long long int temp = 0; RIVKey.tempSize+=NONZEROS;
int *values = input.values; }
int *values_stop = values+input.count;
while(values<values_stop){ /* add word's L1 RIV to the accumulating implicit RIV */
temp += (*values)*(*values); makeSparseLocations(word, locations, locationCount);
values++; locationCount+= NONZEROS;
} }
float magnitude = sqrt(temp);
input.magnitude = magnitude; sparseRIV output = consolidateI2S(locations, locationCount);
return magnitude;
} /* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS;
denseRIV lexPull(char* word){ output.boolean = 1;
#if CACHESIZE > 0
return output;
/* if there is a cache, first check if the word is cached */ }
srand(wordtoSeed((unsigned char*)word));
int hash = rand()%CACHESIZE; sparseRIV fileToL2Clean(FILE *data){
if(!strcmp(word, RIVKey.RIVCache[hash].name)){
/* if word is cached, pull from cache and exit */ unsigned char word[100] = {0};
return RIVKey.RIVCache[hash]; int *locations = RIVKey.h_tempBlock;
} unsigned int blockSize;
#endif /* CACHESIZE > 0 */
denseRIV output; int locationCount = 0;
while(fscanf(data, "%99s", word)){
char pathString[200]; if(feof(data)){
break;
sprintf(pathString, "lexicon/%s", word); }
FILE *lexWord = fopen(pathString, "rb");
if(!(*word)){
/* if this lexicon file already exists */ break;
if(lexWord){ }
/* pull data from file */ /* if the word is not clean, skip it */
output = fLexPull(lexWord); if(!isWordClean((char*)word)){
fclose(lexWord); continue;
}else{ }
/*if file does not exist, return a 0 vector */ blockSize = locationCount+NONZEROS;
output = denseAllocate(); if(blockSize>RIVKey.tempSize){
} RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
strcpy(output.name, word); RIVKey.tempSize+=NONZEROS;
return output; }
}
int lexPush(denseRIV RIVout){ makeSparseLocations(word, locations, locationCount);
//printf("%s\n", (*RIVout).name); locationCount+= NONZEROS;
#if CACHESIZE == 0
fLexPush(RIVout); }
return 0;
#else /* CACHESIZE != 0 */ sparseRIV output = consolidateI2S(locations, locationCount);
/* if our RIV was cached, there are two options (hopefully) /* frequency records the number of words in this file */
* either the RIV is still cached, and the data has been updated to the cache output.frequency = locationCount/NONZEROS;
* or the RIV was pushed out from under it, in which case it has already been pushed*/ output.boolean = 1;
return output;
if(RIVout.cached){ }
return 0;
} sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){
srand(wordtoSeed((unsigned char*)RIVout.name)); /* direct method is faster on small datasets, but has geometric scaling on large datasets */
int hash = rand()%CACHESIZE; return consolidateI2SDirect(implicit, valueCount);
}else{
if(!RIVKey.RIVCache[hash].cached){ /* optimized for large datasets */
RIVKey.RIVCache[hash] = RIVout; return consolidateI2SIndirect(implicit, valueCount);
RIVKey.RIVCache[hash].cached = 1; }
return 0;
}
/*if the current RIV is more frequent than the RIV holding it's slot */ void aggregateWord2D(denseRIV destination, char* word){
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
//scanf("%f", &(*RIVout).magnitude);
//printf("%s replacing %s\n", (*RIVout).name, RIVKey.RIVCache[hash].name); srand(wordtoSeed((unsigned char*)word));
/* push the current cache entry to a file */ for(int i=0; i<NONZEROS; i++){
int diag = fLexPush(RIVKey.RIVCache[hash]);
/* replace the cache entry with the currrent RIV */ destination.values[(rand()%RIVSIZE)] +=1;
destination.values[(rand()%RIVSIZE)] -= 1;
RIVKey.RIVCache[hash] = RIVout; }
RIVKey.RIVCache[hash].cached = 1; }
return diag;
}else{ float cosCompare(denseRIV baseRIV, sparseRIV comparator){
/* push current RIV to file */
fLexPush(RIVout); int dot = 0;
}
return 0; int *values = comparator.values;
#endif /* CACHESIZE == 0 */ int *locations = comparator.locations;
} int *locations_Stop = locations+comparator.count;
sparseRIV fileToL2direct(FILE *data){; while(locations<locations_Stop){
unsigned char word[100] = {0}; /* we calculate the dot-product to derive the cosine
denseRIV denseTemp; * comparing sparse to dense by index*/
// a temporary dense RIV is stored in the tempBlock dot += (*values)*(*(baseRIV.values+(*locations)));
denseTemp.values = RIVKey.h_tempBlock; locations++;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int)); values++;
int count = 0; }
while(fscanf(data, "%99s", word)){ /*dot divided by product of magnitudes */
count++; float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
if(feof(data)){
break; return cosine;
} }
if(!(*word)){
break; float getMagnitudeSparse(sparseRIV input){
} unsigned long long int temp = 0;
int *values = input.values;
int *values_stop = values+input.count;
// add word's L1 RIV to the accumulating implicit RIV while(values<values_stop){
aggregateWord2D(denseTemp, (char*)word); temp += (*values)*(*values);
values++;
}
sparseRIV output = consolidateD2S(denseTemp.values); }
float magnitude = sqrt(temp);
// frequency records the number of words in this file input.magnitude = magnitude;
output.frequency = count; return magnitude;
output.boolean = 1; }
return output;
} denseRIV lexPull(char* word){
/* lexPull returns, by value, the dense RIV stored for `word`.
 * Lookup order: the cache slot chosen by the word's seed (when CACHESIZE > 0),
 * then the file "lexicon/<word>", and finally a fresh vector from
 * denseAllocate() when no such file can be opened.
 * NOTE(review): pathString holds 200 bytes; callers visible here read words
 * with "%99s", which fits - confirm no caller passes a longer word. */
#if CACHESIZE > 0
/* if there is a cache, first check if the word is cached */
/* the slot index comes from rand() after reseeding with the word's seed,
 * so this call resets the global rand() sequence */
srand(wordtoSeed((unsigned char*)word));
int hash = rand()%CACHESIZE;
if(!strcmp(word, RIVKey.RIVCache[hash].name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
denseRIV output;
char pathString[200];
sprintf(pathString, "lexicon/%s", word);
FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */
if(lexWord){
/* pull data from file */
output = fLexPull(lexWord);
fclose(lexWord);
}else{
/* if the file does not exist, return a 0 vector (the word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons?
output = denseAllocate();
}
/* whichever source produced the vector, label it with the requested word */
strcpy(output.name, word);
return output;
}
/* lexPush stores a dense RIV, either in the in-memory cache or in its
 * lexicon file.
 *
 * With CACHESIZE == 0 the vector is always written with fLexPush.
 * Otherwise the slot is chosen from the word's seed and:
 *   - a RIV already flagged as cached needs no work;
 *   - an empty slot takes the RIV;
 *   - an occupied slot is evicted to file when the incoming RIV has the
 *     higher frequency, and the incoming RIV takes its place;
 *   - in every other case the incoming RIV goes straight to file.
 *
 * returns: 0 when nothing had to be written, otherwise the status reported
 * by fLexPush. Previously only the eviction path reported that status.
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
    /* if there is no cache, simply push to file */
    return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
    /* a cached RIV is either still in the cache (and up to date there) or
     * was evicted, in which case the eviction already wrote it out */
    if(RIVout.cached){
        return 0;
    }

    /* same slot selection as lexPull: reseed rand() with the word's seed */
    srand(wordtoSeed((unsigned char*)RIVout.name));
    int hash = rand()%CACHESIZE;

    if(!RIVKey.RIVCache[hash].cached){
        /* empty slot: keep the RIV in memory instead of writing it */
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return 0;
    }

    if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency)){
        /* the more frequent word wins the slot; write the loser to file */
        int diag = fLexPush(RIVKey.RIVCache[hash]);
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return diag;
    }

    /* slot is held by a word at least as frequent: write this one to file */
    return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct builds a file's aggregate vector by adding each word
 * directly into a dense scratch vector (RIVKey.h_tempBlock) and converting
 * the result to sparse form with consolidateD2S.
 *
 * data: open input stream, read from its current position.
 * returns: the sparse aggregate; frequency holds the number of words that
 * were actually added and boolean is set to 1.
 */
sparseRIV fileToL2direct(FILE *data){
    unsigned char word[100] = {0};

    /* a temporary dense RIV is stored in the tempBlock */
    denseRIV denseTemp = {0};
    denseTemp.values = RIVKey.h_tempBlock;
    memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));

    int count = 0;
    /* compare against 1 so a read error (EOF return) ends the loop */
    while(fscanf(data, "%99s", (char*)word) == 1){
        /* a token that touches end-of-file is dropped, as in fileToL2 */
        if(feof(data)){
            break;
        }
        if(!(*word)){
            break;
        }
        /* add the word's L1 RIV to the accumulating dense vector */
        aggregateWord2D(denseTemp, (char*)word);
        /* counted only after the word was added; counting before the EOF
         * check reported one word too many */
        count++;
    }

    sparseRIV output = consolidateD2S(denseTemp.values);
    /* frequency records the number of words in this file */
    output.frequency = count;
    output.boolean = 1;
    return output;
}
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strsafe.h>
/* SEEDMASK is the base bit pattern that setKeyData shifts to build its
 * per-slot masks. The LL suffix pins it to a 64-bit type. */
#define SEEDMASK 25214903917LL
/* RIVData holds the process-wide settings and scratch buffers of the RIV
 * toolkit; the single instance is RIVKeyData, filled in by setKeyData. */
struct RIVData{
    int RIVsize;            /* vector dimensionality; sizes every dense buffer */
    int nonZeros;           /* seeds (and so locations) generated per word */
    long long int *masks;   /* nonZeros masks derived from SEEDMASK */
    int *h_tempBlock;       /* scratch block of blockSize ints (see setKeyData) */
    /* the staging and d_ members are not used by the code in this section */
    int *h_stagingBlock;
    int *h_staging_slider;
    int *h_staging_stop;
    int *h_displacements;
    int *d_OpenSlot;
    int *d_SlotEnd;
    float *d_magnitudes;
    int thing;              /* counter incremented by every FileToL2 call */
}RIVKeyData;
/* sparseRIV stores a vector as parallel location/value arrays plus some
 * metadata. locations[i] is a dense index, values[i] the value held there. */
typedef struct{
    char name[100];   /* label printed by cosineCompare */
    int *values;      /* count non-zero values */
    int *locations;   /* count dense indexes, parallel to values */
    int count;        /* number of location/value pairs */
    int frequency;
    float magnitude;  /* Euclidean length, filled in by getMagnitudes */
    int boolean;      /* non-zero while cosineCompare should still consider it */
}sparseRIV;
/* forward declarations for the functions defined below */

/* reads whitespace-delimited words from a stream into one aggregate RIV */
sparseRIV FileToL2(FILE *data);
/* converts a dense vector of RIVsize ints into sparse form */
void consolidateD2S(sparseRIV *destination, int *denseInput);
/* fills in RIVKeyData; must run before any other function here */
void setKeyData(int RIVsize, int nonZeros, int blockSize);
/* expands a sparse RIV into a caller-supplied dense buffer */
int* mapS2D(int * destination, sparseRIV input);
/* turns seeds into dense indexes, written to RIVKeyData.h_tempBlock */
int* makeSparseLocations(int *seeds, int seedCount);
/* appends the nonZeros seeds of one word to a seed array */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
/* cosine of baseRIV against each still-active multiplier */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
/* computes and stores the magnitude of each RIV */
void getMagnitudes(sparseRIV *inputs, int RIVCount);
/* sums alternating +1/-1 contributions at the given locations */
int *mapI2D(int *locations, int seedCount);
/* like FileToL2, for a space-delimited string */
sparseRIV text2L2(unsigned char *text);
/* copies the next space-delimited token and advances the cursor */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
sparseRIV FileToL2(FILE *data){ sparseRIV FileToL2(FILE *data){
unsigned char *word = (unsigned char*)calloc(2000, 1); unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = RIVKeyData.h_tempBlock; int *seeds = RIVKeyData.h_tempBlock;
int seedCount = 0; int seedCount = 0;
while(fscanf(data, "%s", word)){ while(fscanf(data, "%s", word)){
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
makeSeeds(word, &seeds, &seedCount); makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000); memset(word, 0, 2000);
} }
int *locations = makeSparseLocations(seeds, seedCount); int *locations = makeSparseLocations(seeds, seedCount);
//printf("mcshittles"); //printf("mcshittles");
int *L2dense; int *L2dense;
L2dense = mapI2D(locations, seedCount); L2dense = mapI2D(locations, seedCount);
sparseRIV output; sparseRIV output;
//printf("tits"); //printf("tits");
consolidateD2S( &output, L2dense); consolidateD2S( &output, L2dense);
free(L2dense); free(L2dense);
output.boolean = 1; output.boolean = 1;
RIVKeyData.thing++; RIVKeyData.thing++;
return output; return output;
} }
/* cosineCompare computes the cosine between baseRIV and every multiplier
 * whose boolean flag is set.
 *
 * baseRIV is expanded into RIVKeyData.h_tempBlock so each dot product only
 * touches the multiplier's own locations. A result >= threshold is printed
 * and the matching multiplier's boolean is cleared in the caller's array.
 *
 * returns: a heap array of multiplierCount floats, owned by the caller.
 * Slots of skipped multipliers are 0 (they used to be uninitialised).
 * NULL is returned when the array cannot be allocated.
 */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
    int *baseDenseRIV = RIVKeyData.h_tempBlock;
    mapS2D(baseDenseRIV, baseRIV);

    /* at least one element, so an empty input still yields a valid pointer */
    float *outputs = (float*)calloc(multiplierCount > 0 ? multiplierCount : 1, sizeof(float));
    if(!outputs){
        return NULL;
    }

    for(int m = 0; m < multiplierCount; m++){
        sparseRIV *candidate = &multipliers[m];
        if(!candidate->boolean){
            continue;
        }

        /* sparse-by-dense dot product */
        int dot = 0;
        for(int k = 0; k < candidate->count; k++){
            dot += candidate->values[k] * baseDenseRIV[candidate->locations[k]];
        }

        outputs[m] = dot/((baseRIV.magnitude)*(candidate->magnitude));
        if(outputs[m] >= threshold){
            printf("%s\t%s\n%f\n", candidate->name, baseRIV.name, outputs[m]);
            /* a matched RIV is taken out of later comparisons */
            candidate->boolean = 0;
        }
    }
    return outputs;
}
/* getMagnitudes stores the Euclidean length of each of the RIVCount sparse
 * RIVs in its magnitude field. */
void getMagnitudes(sparseRIV *inputs, int RIVCount){
    for(int i = 0; i < RIVCount; i++){
        const int *value = inputs[i].values;
        const int *value_stop = value + inputs[i].count;

        /* squares are formed and summed in 64 bits; an int accumulator
         * overflows once the values grow */
        unsigned long long sumOfSquares = 0;
        for(; value < value_stop; value++){
            long long v = *value;
            sumOfSquares += (unsigned long long)(v * v);
        }
        inputs[i].magnitude = (float)sqrt((double)sumOfSquares);
    }
}
/* mapS2D expands a sparse RIV into a dense one.
 *
 * destination: buffer of at least RIVKeyData.RIVsize ints; it is cleared
 *              and then receives each value at its location.
 * returns: destination.
 */
int* mapS2D(int* destination, sparseRIV input){
    memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
    for(int i = 0; i < input.count; i++){
        destination[input.locations[i]] = input.values[i];
    }
    return destination;
}
/* mapI2D turns an implicit RIV (a list of locations without values) into a
 * dense vector: locations at even positions add +1, odd positions add -1.
 *
 * returns: a zero-initialised heap array of RIVKeyData.RIVsize ints with
 * the contributions summed in, owned by the caller, or NULL when the array
 * cannot be allocated.
 */
int* mapI2D(int *locations, int valueCount){
    int *destination = (int*)calloc(RIVKeyData.RIVsize, sizeof(int));
    if(!destination){
        return NULL;
    }
    for(int i = 0; i < valueCount; i++){
        destination[locations[i]] += (i % 2 == 0) ? 1 : -1;
    }
    return destination;
}
/* consolidateD2S converts a dense vector of RIVKeyData.RIVsize ints into
 * sparse form.
 *
 * destination: receives freshly allocated locations/values arrays and the
 *              pair count; its other fields are left untouched.
 * An all-zero input (or a failed allocation) yields count 0 and NULL arrays
 * rather than the result of a zero-byte realloc.
 */
void consolidateD2S(sparseRIV *destination, int *denseInput){
    int dimensions = RIVKeyData.RIVsize;
    int *locations = (int*)malloc(dimensions*sizeof(int));
    int *values = (int*)malloc(dimensions*sizeof(int));
    int count = 0;

    if(locations && values){
        for(int i = 0; i < dimensions; i++){
            if(denseInput[i]){
                locations[count] = i;
                values[count] = denseInput[i];
                count++;
            }
        }
    }

    if(count == 0){
        free(locations);
        free(values);
        locations = NULL;
        values = NULL;
    }else{
        /* shrink to fit; if shrinking fails the larger block stays valid */
        int *shrunk = (int*)realloc(locations, count*sizeof(int));
        if(shrunk){
            locations = shrunk;
        }
        shrunk = (int*)realloc(values, count*sizeof(int));
        if(shrunk){
            values = shrunk;
        }
    }

    destination->count = count;
    destination->locations = locations;
    destination->values = values;
}
/* setKeyData initialises the global RIVKeyData.
 *
 * RIVsize:   dimensionality of every vector.
 * nonZeros:  seeds per word; an odd value is rounded up to the next even
 *            number and the change is printed.
 * blockSize: number of ints to allocate for the shared scratch block.
 */
void setKeyData(int RIVsize, int nonZeros, int blockSize){
    RIVKeyData.RIVsize = RIVsize;
    if(nonZeros%2){
        printf("your nonZeros must be an even number");
        nonZeros++;
        printf(", changed to %d", nonZeros);
    }
    RIVKeyData.nonZeros = nonZeros;

    RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
    if(RIVKeyData.masks){
        for(int i = 0; i < nonZeros; i++){
            /* shifting a 64-bit value by 64 or more is undefined; SEEDMASK
             * has already been shifted out to 0 well before that point */
            int shift = 5*i;
            RIVKeyData.masks[i] = (shift < 64) ? (((long long int)SEEDMASK) >> shift) : 0;
        }
    }else{
        fprintf(stderr, "setKeyData: could not allocate masks\n");
    }

    RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
    if(!RIVKeyData.h_tempBlock){
        fprintf(stderr, "setKeyData: could not allocate temp block\n");
    }
    RIVKeyData.thing = 0;
}
/* makeSeeds appends the RIVKeyData.nonZeros seeds of one word to a seed
 * array.
 *
 * word:      NUL-terminated word to hash.
 * seeds:     address of the seed array; the array must already have room
 *            for nonZeros more entries after *seedCount.
 * seedCount: number of seeds already stored; increased by nonZeros.
 */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
    /* accumulate in unsigned arithmetic: the old int sum overflowed, and
     * shifting an int by 32 or more (words longer than 6 characters) is
     * undefined behaviour.
     * NOTE(review): the shift count is reduced modulo 32, which is what
     * 32-bit shifts do on x86; confirm existing lexicons were built there,
     * otherwise seeds for long words change. */
    unsigned int seedbits = 0;
    for(int i = 0; word[i]; i++){
        seedbits += (unsigned int)word[i] << ((i*5) & 31);
    }
    int seedbase = (int)seedbits;

    int *seedTrack = (*seeds) + *seedCount;
    for(int i = 0; i < RIVKeyData.nonZeros; i++){
        seedTrack[i] = (seedbase>>i)+(3*i);
    }
    *seedCount += RIVKeyData.nonZeros;
}
/* makeSparseLocations converts seeds into dense indexes.
 *
 * Each seed is XORed with the next mask (the masks repeat every
 * RIVKeyData.nonZeros seeds), reduced to 31 bits and taken modulo
 * RIVKeyData.RIVsize. Results go to RIVKeyData.h_tempBlock, which is also
 * the return value. seeds may be that same block (FileToL2 passes it):
 * each seed is read before its slot is overwritten.
 */
int* makeSparseLocations(int* seeds, int seedCount){
    int *locations = RIVKeyData.h_tempBlock;
    int maskIndex = 0;

    for(int i = 0; i < seedCount; i++){
        long long int mixed = seeds[i] ^ RIVKeyData.masks[maskIndex];
        locations[i] = (mixed & 2147483647) % (RIVKeyData.RIVsize);

        /* wrap around to the first mask after the last one */
        maskIndex++;
        if(maskIndex >= RIVKeyData.nonZeros){
            maskIndex = 0;
        }
    }
    return locations;
}
/* sscanAdvance copies the next space-delimited token of *string into word.
 *
 * string:  address of the read cursor; on return it points just past the
 *          space that ended the token, or at the terminating NUL
 * word:    destination buffer; receives the token plus a NUL terminator.
 *          No length is checked, so it must be large enough for any token
 * returns: word
 */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
	unsigned char *cursor = *string;
	unsigned char *out = word;
	/* copy up to the next space or the end of the text */
	while(*cursor && *cursor != ' '){
		*out++ = *cursor++;
	}
	/* consume exactly one delimiter so the next call starts on the next token */
	if(*cursor == ' '){
		cursor++;
	}
	*out = 0;
	*string = cursor;
	return word;
}
sparseRIV text2L2(unsigned char *text){ sparseRIV text2L2(unsigned char *text){
unsigned char *word = (unsigned char*)calloc(2000, 1); unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int)); int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int));
unsigned char *text_slider = text; unsigned char *text_slider = text;
int seedCount = 0; int seedCount = 0;
while(*text_slider){ while(*text_slider){
sscanAdvance(&text_slider, word); sscanAdvance(&text_slider, word);
makeSeeds(word, &seeds, &seedCount); makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000); memset(word, 0, 2000);
} }
int *locations = makeSparseLocations(seeds, seedCount); int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense; int *L2dense;
L2dense = mapI2D(locations, seedCount); L2dense = mapI2D(locations, seedCount);
free(locations); free(locations);
sparseRIV output; sparseRIV output;
consolidateD2S(&output, L2dense); consolidateD2S(&output, L2dense);
free(seeds); free(seeds);
return output; return output;
} }
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <strsafe.h> #include <strsafe.h>
#define SEEDMASK 25214903917 #define SEEDMASK 25214903917
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/* HandleError terminates the program when a CUDA call failed.
 *
 * err:  status returned by the CUDA call
 * file: source file of the call site (supplied by the HANDLE_ERROR macro)
 * line: source line of the call site (supplied by the HANDLE_ERROR macro)
 *
 * On cudaSuccess it does nothing; otherwise it prints the CUDA error string
 * with the call site and exits with EXIT_FAILURE.
 */
static void HandleError(cudaError_t err, const char *file, int line){
	if(err == cudaSuccess){
		return;
	}
	printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	exit(EXIT_FAILURE);
}
/* squirt replaces each of the first N entries of d_magnitudes with its
 * square root, one entry per thread. Threads whose index is N or more do
 * nothing. */
__global__ void squirt(float *d_magnitudes, int N){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx < N){
		d_magnitudes[idx] = sqrt(d_magnitudes[idx]);
	}
}
/* generateLocations converts seeds to RIV locations for one "team".
 *
 * Each thread handles the seed at index nonZeros*threadNumber + team, so one
 * launch covers every nonZeros-th seed starting at offset team. The seed is
 * XOR-ed with mask, limited to 31 bits and reduced modulo RIVsize.
 * Indices at or beyond seedCount are ignored.
 */
__global__ void generateLocations(int *d_seeds, long long int mask, int *d_locations, int RIVsize, int team, int seedCount, int nonZeros){
	const int slot = nonZeros*(blockIdx.x*blockDim.x + threadIdx.x)+team;
	if(slot < seedCount){
		d_locations[slot] = ((d_seeds[slot]^mask) & 2147483647) %(RIVsize);
	}
}
/* D2S compacts a dense vector into value/location lists.
 *
 * One thread inspects one of the d_DenseSize entries of d_DenseRIV. For a
 * non-zero entry it claims the next output slot by atomically incrementing
 * *d_NZCount and stores the value and its index there. The order of the
 * output entries therefore depends on the order in which threads claim slots.
 */
__global__ void D2S( int* d_DenseRIV, int* d_SparseValues, int* d_SparseLocations, int *d_NZCount, int d_DenseSize){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx >= d_DenseSize) return;

	const int value = d_DenseRIV[idx];
	if(value != 0){
		const int slot = atomicAdd(d_NZCount, 1);
		d_SparseValues[slot] = value;
		d_SparseLocations[slot] = idx;
	}
}
/* S2D scatters a sparse vector into a dense one: thread i atomically adds
 * d_values[i] to d_OpenSlot[d_locations[i]] for the first numberOfValues
 * entries. */
__global__ void S2D(int *d_locations, int *d_values, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx < numberOfValues){
		atomicAdd(&d_OpenSlot[d_locations[idx]], d_values[idx]);
	}
}
/* I2D expands an implicit RIV (locations only) into a dense vector.
 * Thread i atomically adds +1 to d_OpenSlot[d_locations[i]] when i is even
 * and -1 when i is odd, for the first numberOfValues locations. */
__global__ void I2D(int *d_locations, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx >= numberOfValues) return;

	/* the sign alternates with the parity of the index */
	int sign = 1;
	if(idx%2){
		sign = -1;
	}
	atomicAdd(&d_OpenSlot[d_locations[idx]], sign);
}
void consolidateD2SStaged(sparseRIV *destination, int *denseInput); void consolidateD2SStaged(sparseRIV *destination, int *denseInput);
void consolidateD2S_d(sparseRIV *destination, int *denseInput); void consolidateD2S_d(sparseRIV *destination, int *denseInput);
void setKeyData_d(int RIVsize, int nonZeros, int blockSize); void setKeyData_d(int RIVsize, int nonZeros, int blockSize);
int* mapS2D_d(int * destination, sparseRIV input); int* mapS2D_d(int * destination, sparseRIV input);
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount); float *getMagnitudes_d(sparseRIV *inputs, int RIVCount);
int *mapI2D_d(int *locations, int seedCount); int *mapI2D_d(int *locations, int seedCount);
int* makeSparseLocations_d(int* seeds, int seedCount); int* makeSparseLocations_d(int* seeds, int seedCount);
/* getMagnitudes_d computes the magnitude of RIVCount sparse RIVs.
 *
 * The sum of squared values of each RIV is computed on the host, the sums
 * are uploaded, the squirt kernel takes their square roots, and the results
 * are copied back and stored in inputs[i].magnitude.
 *
 * inputs:   array of RIVCount sparse RIVs; their magnitude field is updated
 * RIVCount: number of RIVs
 * returns:  host array of RIVCount magnitudes, allocated with cudaMallocHost
 *           and not released here
 *
 * NOTE(review): RIVKeyData.d_magnitudes is allocated on every call and never
 * freed in this function - confirm who releases it.
 */
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount){
	float *magnitudes;
	HANDLE_ERROR (cudaMallocHost((float**)&magnitudes,RIVCount*sizeof(float)));
	for(int i=0; i<RIVCount; i++){
		/* accumulate in 64 bits: squares of int values overflow an int */
		long long sumOfSquares = 0;
		int *values = inputs[i].values;
		int *values_stop = values+inputs[i].count;
		while(values<values_stop){
			sumOfSquares += (long long)(*values)*(*values);
			values++;
		}
		magnitudes[i] = (float)sumOfSquares;
	}
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_magnitudes, RIVCount*sizeof(float)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_magnitudes, magnitudes, RIVCount*sizeof(float), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	HANDLE_ERROR (cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, squirt));
	gridSize = ((RIVCount + blockSize -1) / blockSize)+1;
	squirt<<<gridSize,blockSize >>> (RIVKeyData.d_magnitudes, RIVCount);
	/* a failed launch is only reported through cudaGetLastError */
	HANDLE_ERROR (cudaGetLastError());
	HANDLE_ERROR (cudaMemcpy (magnitudes, RIVKeyData.d_magnitudes, RIVCount*sizeof(float), cudaMemcpyDeviceToHost));
	for(int i=0; i<RIVCount; i++){
		inputs[i].magnitude = magnitudes[i];
	}
	return magnitudes;
}
/* mapS2D_d expands a sparse RIV into a dense vector on the device.
 *
 * The first RIVsize ints at RIVKeyData.d_OpenSlot are zeroed and used as the
 * dense vector; the sparse locations and values are uploaded directly behind
 * it, scattered in by the S2D kernel, and the dense result is copied to
 * destination.
 *
 * destination: host buffer with room for RIVKeyData.RIVsize ints
 * input:       sparse RIV to expand
 * returns:     destination
 */
int *mapS2D_d(int* destination, sparseRIV input){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;
	int *d_values = d_locations+input.count;

	HANDLE_ERROR (cudaMemset (d_dense, 0, RIVKeyData.RIVsize*sizeof(int)));
	HANDLE_ERROR (cudaMemcpy (d_locations, input.locations, input.count*sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR (cudaMemcpy (d_values, input.values, input.count*sizeof(int), cudaMemcpyHostToDevice));

	/* launch geometry: suggested block size, enough blocks for every value */
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, S2D);
	int gridSize = ((input.count + blockSize -1) / blockSize)+1;

	S2D <<<gridSize,blockSize>>> (d_locations, d_values, d_dense, input.count);

	HANDLE_ERROR (cudaMemcpy (destination, d_dense, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyDeviceToHost));
	return destination;
}
/* mapI2D_d expands an implicit RIV (a list of locations) into a dense vector
 * on the device.
 *
 * The first RIVsize ints at RIVKeyData.d_OpenSlot are zeroed and used as the
 * dense vector; the locations are uploaded directly behind it and applied by
 * the I2D kernel.
 *
 * locations:  valueCount host-side locations
 * returns:    RIVKeyData.h_tempBlock, holding the RIVsize dense values. The
 *             block is shared, not owned by the caller.
 */
int* mapI2D_d(int *locations, int valueCount){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;

	HANDLE_ERROR (cudaMemset (d_dense, 0, RIVKeyData.RIVsize*sizeof(int)));
	HANDLE_ERROR (cudaMemcpy (d_locations, locations, valueCount*sizeof(int), cudaMemcpyHostToDevice));

	/* launch geometry: suggested block size, enough blocks for every location */
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, I2D);
	int gridSize = ((valueCount + blockSize -1) / blockSize)+1;

	I2D <<<gridSize,blockSize>>> (d_locations, d_dense, valueCount);

	int *valuesOut = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy (valuesOut, d_dense, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyDeviceToHost));
	return valuesOut;
}
/* consolidateD2SStaged converts a dense vector to sparse form and stores it
 * in the host staging block.
 *
 * The non-zero entries of denseInput are first gathered in the temp block
 * (locations in its first RIVsize ints, values in the next RIVsize). They are
 * then pushed onto the staging block, which is filled downward from
 * RIVKeyData.h_staging_slider: locations first, then values, then the count.
 * The offset of the count from the start of the staging block is appended to
 * the list at RIVKeyData.h_displacements.
 *
 * destination: receives pointers into the staging block (locations, values)
 * denseInput:  RIVKeyData.RIVsize dense values
 *
 * NOTE(review): destination->count is not set here, and nothing checks that
 * the downward staging data stays clear of the displacement list - confirm
 * both against the callers.
 */
void consolidateD2SStaged(sparseRIV *destination, int *denseInput){
	int count = 0;
	int *locations = RIVKeyData.h_tempBlock;
	int *values = RIVKeyData.h_tempBlock + RIVKeyData.RIVsize;
	for(int i=0; i<RIVKeyData.RIVsize; i++){
		/* read before writing: count never exceeds i, so the scan stays
		 * correct even if denseInput is the temp block itself */
		int value = denseInput[i];
		if(value){
			locations[count] = i;
			values[count] = value;
			count++;
		}
	}

	/* copy exactly count locations, last to first, so they keep their order */
	int *locations_slider = locations+count;
	while(locations_slider>locations){
		RIVKeyData.h_staging_slider--;
		locations_slider--;
		*RIVKeyData.h_staging_slider = *locations_slider;
	}
	(*destination).locations = RIVKeyData.h_staging_slider;

	/* same for the values */
	int *values_slider = values+count;
	while(values_slider>values){
		RIVKeyData.h_staging_slider--;
		values_slider--;
		*RIVKeyData.h_staging_slider = *values_slider;
	}
	(*destination).values = RIVKeyData.h_staging_slider;

	/* the count sits directly below the values */
	RIVKeyData.h_staging_slider--;
	*RIVKeyData.h_staging_slider = count;
	*RIVKeyData.h_displacements = RIVKeyData.h_staging_slider -RIVKeyData.h_stagingBlock;
	RIVKeyData.h_displacements++;
}
/* consolidateD2S_d converts a dense vector to sparse form on the device.
 *
 * denseInput is uploaded to RIVKeyData.d_OpenSlot, the D2S kernel writes the
 * non-zero values and their locations into the two RIVsize-sized regions
 * behind it, and the results are copied into the host staging block at
 * RIVKeyData.h_staging_slider (locations first, then values). The slider is
 * advanced past both.
 *
 * destination: receives count, locations and values (pointers into the
 *              staging block)
 * denseInput:  RIVKeyData.RIVsize dense values on the host
 *
 * NOTE(review): this function moves h_staging_slider upward, while
 * setKeyData_d starts it at the end of the staging block and
 * consolidateD2SStaged moves it downward - confirm which direction is
 * intended before using this path.
 */
void consolidateD2S_d(sparseRIV *destination, int *denseInput){
	int *d_valueCount;
	HANDLE_ERROR (cudaMalloc((void**)&d_valueCount, sizeof(int)));
	HANDLE_ERROR (cudaMemset(d_valueCount, 0, sizeof(int)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_OpenSlot, denseInput, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyHostToDevice));
	int *d_outValues = RIVKeyData.d_OpenSlot+RIVKeyData.RIVsize;
	int *d_outLocations = d_outValues+RIVKeyData.RIVsize;
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	HANDLE_ERROR (cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, D2S));
	gridSize = ((RIVKeyData.RIVsize + blockSize -1) / blockSize)+1;
	D2S <<<gridSize,blockSize>>> (RIVKeyData.d_OpenSlot, d_outValues, d_outLocations, d_valueCount, RIVKeyData.RIVsize);
	/* report launch failures and errors raised while the kernel ran */
	HANDLE_ERROR (cudaGetLastError());
	HANDLE_ERROR (cudaDeviceSynchronize());
	HANDLE_ERROR (cudaMemcpy (&(*destination).count, d_valueCount, sizeof(int), cudaMemcpyDeviceToHost));
	(*destination).locations = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=(*destination).count;
	(*destination).values = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=(*destination).count;
	HANDLE_ERROR (cudaMemcpy ((*destination).values, d_outValues, ((*destination).count)*sizeof(int), cudaMemcpyDeviceToHost));
	HANDLE_ERROR (cudaMemcpy ((*destination).locations, d_outLocations, ((*destination).count)*sizeof(int), cudaMemcpyDeviceToHost));
	HANDLE_ERROR (cudaFree(d_valueCount));
}
/* setKeyData_d initialises the global RIVKeyData for the device code path.
 *
 * RIVsize:   dimensionality of the RIVs
 * nonZeros:  non-zero entries per word; an odd value is raised to the next
 *            even number (with a message)
 * blockSize: size, in ints, of the host temp block, the host staging block
 *            and the device block that are allocated here
 *
 * The staging slider starts at the end of the staging block and the
 * displacement list at its beginning.
 */
void setKeyData_d(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2){
		printf("your nonZeros must be an even number");
		nonZeros++;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
	if(nonZeros > 0 && !RIVKeyData.masks){
		printf("could not allocate %d seed masks\n", nonZeros);
		exit(EXIT_FAILURE);
	}
	for(int i = 0; i<nonZeros; i++){
		/* each mask is SEEDMASK shifted five more bits to the right; a shift
		 * of 64 bits or more is undefined, and every bit is gone by then */
		int shift = 5*i;
		RIVKeyData.masks[i] = (shift < 64) ? (SEEDMASK>>shift) : 0;
	}
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_tempBlock, blockSize*sizeof(int)));
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_stagingBlock, blockSize*sizeof(int)));
	RIVKeyData.h_staging_stop = RIVKeyData.h_stagingBlock + blockSize;
	RIVKeyData.h_staging_slider = RIVKeyData.h_staging_stop;
	RIVKeyData.h_displacements = RIVKeyData.h_stagingBlock;
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_OpenSlot, blockSize*sizeof(int)));
	RIVKeyData.d_SlotEnd = RIVKeyData.d_OpenSlot+blockSize;
	RIVKeyData.thing = 0;
}
/* makeSparseLocations_d maps seeds to RIV locations on the device.
 *
 * The seeds are uploaded behind a seedCount-sized output region at
 * RIVKeyData.d_OpenSlot. generateLocations is then launched once per "team"
 * (one launch per mask in RIVKeyData.masks), each launch handling every
 * nonZeros-th seed.
 *
 * seeds:     seedCount host-side seeds
 * returns:   RIVKeyData.h_tempBlock, whose first seedCount entries hold the
 *            locations. The block is shared, not owned by the caller.
 */
int* makeSparseLocations_d(int* seeds, int seedCount){
	int *d_locations = RIVKeyData.d_OpenSlot;
	int *d_seeds = d_locations+seedCount;
	HANDLE_ERROR (cudaMemcpy(d_seeds, seeds, seedCount*sizeof(int), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	HANDLE_ERROR (cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, generateLocations));
	gridSize = ((seedCount + blockSize -1) / (RIVKeyData.nonZeros*blockSize))+1;
	long long int *mask = RIVKeyData.masks;
	for(int team=0; team<RIVKeyData.nonZeros; team++){
		/* the third launch parameter is a dynamic shared-memory size in
		 * bytes; the kernel uses none, so the team number is no longer
		 * passed there */
		generateLocations <<<gridSize,blockSize>>> (d_seeds, *mask, d_locations, RIVKeyData.RIVsize, team, seedCount, RIVKeyData.nonZeros);
		HANDLE_ERROR (cudaGetLastError());
		mask++;
	}
	HANDLE_ERROR (cudaDeviceSynchronize());
	int *locations = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy(locations, d_locations, seedCount*sizeof(int), cudaMemcpyDeviceToHost));
	return locations;
}
/* addS2DsBlocked applies one sparse RIV to a block of dense RIVs held on the
 * device and copies the block back to the host.
 *
 * The sparse locations and values are uploaded directly behind the
 * RIVCount*RIVsize dense ints at RIVKeyData.d_OpenSlot, and the S2Ds kernel
 * is launched with one single-thread block per sparse entry.
 *
 * denseBlock: host buffer with room for RIVCount*RIVKeyData.RIVsize ints
 * additive:   sparse RIV to apply
 * RIVCount:   number of dense RIVs in the block
 *
 * NOTE(review): the launch shape (additive.count blocks of one thread) is
 * kept as it was; S2Ds is not visible here, so confirm its indexing before
 * changing it.
 */
void addS2DsBlocked(int *denseBlock, sparseRIV additive, int RIVCount){
	int *d_locations= RIVKeyData.d_OpenSlot+RIVCount*RIVKeyData.RIVsize;
	int *d_values = d_locations+additive.count;
	HANDLE_ERROR (cudaMemcpy (d_locations, additive.locations, additive.count*sizeof(int), cudaMemcpyHostToDevice));
	HANDLE_ERROR (cudaMemcpy (d_values, additive.values, additive.count*sizeof(int), cudaMemcpyHostToDevice));
	/* a grid of zero blocks is not a valid launch; nothing to add then */
	if(additive.count > 0){
		S2Ds<<<additive.count,1>>>(RIVKeyData.d_OpenSlot, d_locations, d_values, additive.count, RIVCount, RIVKeyData.RIVsize);
		HANDLE_ERROR (cudaGetLastError());
	}
	HANDLE_ERROR (cudaMemcpy (denseBlock, RIVKeyData.d_OpenSlot, RIVCount*RIVKeyData.RIVsize*sizeof(int), cudaMemcpyDeviceToHost));
}
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "RIVLowerMorphic.h" #include "RIVLowerMorphic.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage */ /* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout); int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon") /* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes. * and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word * if the file does not exist, it creates a 0 vector with the name of word
*/ */
denseRIV lexPull(char* word); denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n") /* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each * and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained * word contained
*/ */
sparseRIV fileToL2(FILE *input); sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
 * containing lowercase letters and the '_' symbol.
 * This is important if you will be lexPush-ing those words later.
 */
sparseRIV fileToL2Clean(FILE *data); sparseRIV fileToL2Clean(FILE *data);
sparseRIV fileToL2direct(FILE *data); sparseRIV fileToL2direct(FILE *data);
/*cosine determines the "similarity" between two RIVs. */ /*cosine determines the "similarity" between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator); float cosCompare(denseRIV baseRIV, sparseRIV comparator);
sparseRIV wordtoL2(char* word); sparseRIV wordtoL2(char* word);
sparseRIV consolidateI2S(int *implicit, size_t valueCount); sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV text2L2(char *text); sparseRIV text2L2(char *text);
sparseRIV text2L2(char *text){ sparseRIV text2L2(char *text){
unsigned int blockSize; unsigned int blockSize;
char word[100] = {0}; char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */ * to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int locationCount = 0; unsigned int locationCount = 0;
int displacement; int displacement;
while(sscanf(text, "%99s%n", word, &displacement)){ while(sscanf(text, "%99s%n", word, &displacement)){
text += displacement+1; text += displacement+1;
if(!displacement){ if(!displacement){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
blockSize = locationCount+NONZEROS; blockSize = locationCount+NONZEROS;
/* if this word would overflow the locations block, grow it */ /* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){ if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock; locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS; RIVKey.tempSize+=NONZEROS;
} }
/* add word's L1 RIV to the accumulating implicit RIV */ /* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations((unsigned char*)word, locations, locationCount); makeSparseLocations((unsigned char*)word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
} }
sparseRIV output = consolidateI2S(locations, locationCount); sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
*(output.frequency) = locationCount/NONZEROS; *(output.frequency) = locationCount/NONZEROS;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
sparseRIV fileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned int blockSize; unsigned int blockSize;
unsigned char word[100] = {0}; unsigned char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */ * to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
blockSize = locationCount+NONZEROS; blockSize = locationCount+NONZEROS;
/* if this word would overflow the locations block, grow it */ /* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){ if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock; locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS; RIVKey.tempSize+=NONZEROS;
} }
/* add word's L1 RIV to the accumulating implicit RIV */ /* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
} }
sparseRIV output = consolidateI2S(locations, locationCount); sparseRIV output = consolidateI2S(locations, locationCount);
output.frequency = malloc(1*sizeof(int)); output.frequency = malloc(1*sizeof(int));
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
*(output.frequency) = locationCount/NONZEROS; *(output.frequency) = locationCount/NONZEROS;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
sparseRIV fileToL2Clean(FILE *data){ sparseRIV fileToL2Clean(FILE *data){
unsigned char word[100] = {0}; unsigned char word[100] = {0};
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int blockSize; unsigned int blockSize;
int locationCount = 0; int locationCount = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
/* if the word is not clean, skip it */ /* if the word is not clean, skip it */
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
blockSize = locationCount+NONZEROS; blockSize = locationCount+NONZEROS;
if(blockSize>RIVKey.tempSize){ if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock; locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS; RIVKey.tempSize+=NONZEROS;
} }
makeSparseLocations(word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
} }
sparseRIV output = consolidateI2S(locations, locationCount); sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
*(output.frequency) = locationCount/NONZEROS; *(output.frequency) = locationCount/NONZEROS;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
sparseRIV consolidateI2S(int *implicit, size_t valueCount){ sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount>RIVKey.I2SThreshold){ if(valueCount>RIVKey.I2SThreshold){
return consolidateI2SIndirect(implicit, valueCount); return consolidateI2SIndirect(implicit, valueCount);
}else{ }else{
return consolidateI2SDirect(implicit, valueCount); return consolidateI2SDirect(implicit, valueCount);
} }
} }
/* aggregateWord2D adds a word's random index pattern directly to a dense RIV.
 *
 * The random generator is seeded from the word (wordtoSeed), so the same
 * word always selects the same positions. NONZEROS times, one pseudo-random
 * position is incremented and the next one decremented.
 *
 * destination: dense RIV whose values array is updated in place
 * word:        NUL-terminated word
 */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));

	int remaining = NONZEROS;
	while(remaining-- > 0){
		/* the two draws must stay in this order to select the same slots */
		int raised = rand()%RIVSIZE;
		destination.values[raised] += 1;
		int lowered = rand()%RIVSIZE;
		destination.values[lowered] -= 1;
	}
}
/* cosCompare returns the cosine similarity of a dense and a sparse RIV.
 *
 * The dot product only visits the non-zero entries of the sparse RIV; it is
 * divided by the product of the two stored magnitudes, which must have been
 * computed beforehand. If either magnitude is zero the division yields a
 * non-finite result.
 *
 * baseRIV:    dense RIV (values and magnitude are read)
 * comparator: sparse RIV (values, locations, count and magnitude are read)
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* 64-bit accumulator: products of int values overflow an int */
	long long dot = 0;
	int *values = comparator.values;
	int *locations = comparator.locations;
	int *locations_Stop = locations+comparator.count;
	while(locations<locations_Stop){
		/* we calculate the dot-product to derive the cosine */
		dot += (long long)(*values)*(*(baseRIV.values+(*locations)));
		locations++;
		values++;
	}
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the Euclidean magnitude of a RIV.
 *
 * When the SPARSE flag is set, input.count values are summed; otherwise all
 * RIVSIZE values are.
 *
 * input:   the RIV, passed by value - the magnitude is only returned, so the
 *          caller has to store it
 * returns: square root of the sum of squared values
 */
float getMagnitudeSparse(RIV input){
	size_t count;
	if(input.flags & SPARSE){
		count = input.count;
	}else{
		count = RIVSIZE;
	}
	unsigned long long int temp = 0;
	int *values = input.values;
	int *values_stop = values+count;
	while(values<values_stop){
		/* widen before multiplying: the square of an int can overflow an int */
		long long value = *values;
		temp += (unsigned long long int)(value*value);
		values++;
	}
	float magnitude = sqrt(temp);
	return magnitude;
}
denseRIV lexPull(char* word){ denseRIV lexPull(char* word){
#if CACHESIZE > 0 #if CACHESIZE > 0
/* if there is a cache, first check if the word is cached */ /* if there is a cache, first check if the word is cached */
srand(wordtoSeed((unsigned char*)word)); srand(wordtoSeed((unsigned char*)word));
int hash = rand()%RIVKey.cacheSize; int hash = rand()%RIVKey.cacheSize;
if(!strcmp(word, RIVKey.RIVCache[hash].name)){ if(!strcmp(word, RIVKey.RIVCache[hash].name)){
/* if word is cached, pull from cache and exit */ /* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash]; return RIVKey.RIVCache[hash];
} }
#endif /* CACHESIZE > 0 */ #endif /* CACHESIZE > 0 */
denseRIV output; denseRIV output;
char pathString[200]; char pathString[200];
sprintf(pathString, "lexicon/%s", word); sprintf(pathString, "lexicon/%s", word);
FILE *lexWord = fopen(pathString, "rb"); FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */ /* if this lexicon file already exists */
if(lexWord){ if(lexWord){
/* pull data from file */ /* pull data from file */
output = fLexPull(lexWord); output = fLexPull(lexWord);
fclose(lexWord); fclose(lexWord);
}else{ }else{
/*if file does not exist, return a 0 vector */ /*if file does not exist, return a 0 vector */
output = denseAllocate(); output = denseAllocate();
} }
strcpy(output.name, word); strcpy(output.name, word);
return output; return output;
} }
/* lexPush stores a dense RIV, going through the cache when one is compiled in.
 *
 * RIVout:  the RIV to store.
 * returns: 0 when nothing had to be written to disk; otherwise the value
 *          returned by fLexPush for the write that was performed
 *          (presumably 0 on success - confirm against fLexPush).
 *
 * With CACHESIZE == 0 the RIV is always written with fLexPush.
 * With a cache:
 *   - a RIV already flagged as cached is left alone;
 *   - if the slot selected for its name is free, the RIV takes the slot;
 *   - if the slot is occupied by a less frequent RIV, the occupant is
 *     written out with fLexPush and replaced;
 *   - otherwise the RIV itself is written out with fLexPush.
 *
 * Side effect: the global rand() generator is re-seeded from the RIV's name
 * in order to pick the cache slot.
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
    /* no cache: report the outcome of the write instead of discarding it */
    return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
    /* if our RIV was cached, there are two options (hopefully):
     * either the RIV is still cached, and the data has been updated in the cache,
     * or the RIV was pushed out from under it, in which case it has already been pushed */
    if(RIVout.cached){
        return 0;
    }

    srand(wordtoSeed((unsigned char*)RIVout.name));
    int hash = rand()%RIVKey.cacheSize;

    /* free slot: simply take it */
    if(!RIVKey.RIVCache[hash].cached){
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return 0;
    }

    /* the current RIV is more frequent than the RIV holding its slot */
    if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency)){
        /* push the current cache entry to a file */
        int diag = fLexPush(RIVKey.RIVCache[hash]);
        /* replace the cache entry with the current RIV */
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return diag;
    }

    /* slot is held by a RIV at least as frequent: push current RIV to file,
     * propagating the write result like the eviction path does */
    return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
sparseRIV fileToL2direct(FILE *data){; sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0}; unsigned char word[100] = {0};
denseRIV denseTemp; denseRIV denseTemp;
// a temporary dense RIV is stored in the tempBlock // a temporary dense RIV is stored in the tempBlock
denseTemp.values = RIVKey.h_tempBlock; denseTemp.values = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int)); memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
int count = 0; int count = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
count++; count++;
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
// add word's L1 RIV to the accumulating implicit RIV // add word's L1 RIV to the accumulating implicit RIV
aggregateWord2D(denseTemp, (char*)word); aggregateWord2D(denseTemp, (char*)word);
} }
sparseRIV output = consolidateD2S(denseTemp.values); sparseRIV output = consolidateD2S(denseTemp.values);
// frequency records the number of words in this file // frequency records the number of words in this file
*(output.frequency) = count; *(output.frequency) = count;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* Entry point: initialises the RIV system with RIVInit(), then walks the
 * "lexicon/" directory with directoryToL2s. Always exits with status 0. */
int main(void){
    char rootString[] = "lexicon/";

    RIVInit();
    directoryToL2s(rootString);
    return 0;
}
/* directoryToL2s walks a directory tree and, for every regular entry, pulls
 * its RIV with lexPull, consolidates it with consolidateD2S and prints the
 * entry's path together with the consolidated RIV's count ("saturation").
 *
 * rootString: directory to scan; it must end with '/' because entry names
 *             are appended to it directly.
 *
 * Entries whose name starts with '.' are skipped. Subdirectories are
 * processed recursively. If an entry cannot be opened the scan of this
 * directory stops. The directory handle is always closed before returning.
 */
void directoryToL2s(char *rootString){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }

    struct dirent *files;
    while((files = readdir(directory))){
        /* skip ".", ".." and any other dot-prefixed entry */
        if(*(files->d_name) == '.') continue;

        if(files->d_type == DT_DIR){
            /* bounded path construction: never write past pathString */
            int dirLen = snprintf(pathString, sizeof pathString, "%s%s/",
                                  rootString, files->d_name);
            if(dirLen < 0 || (size_t)dirLen >= sizeof pathString){
                printf("path too long, skipping %s%s/\n", rootString, files->d_name);
                continue;
            }
            directoryToL2s(pathString);
            /* a directory is not a lexicon file: do not fall through and
             * try to read it as one */
            continue;
        }

        int fileLen = snprintf(pathString, sizeof pathString, "%s%s",
                               rootString, files->d_name);
        if(fileLen < 0 || (size_t)fileLen >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        /* open the entry only to confirm it is readable */
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }

        /* NOTE(review): lexPull receives the full path, root included; if
         * lexPull itself prefixes "lexicon/" this looks up
         * "lexicon/lexicon/..." - confirm whether files->d_name was intended.
         * Also confirm that freeing temp.values is safe when lexPull hands
         * back a cache entry. */
        denseRIV temp = lexPull(pathString);
        sparseRIV fileRIV = consolidateD2S(temp.values);
        float count = fileRIV.count;
        /* print the path directly instead of copying it into fileRIV.name,
         * whose capacity is not known to fit a 2000-byte path */
        printf("%s, saturation: %f\n", pathString, count);
        fclose(input);
        free(temp.values);
        /* NOTE(review): fileRIV.locations is not released here; whether this
         * function owns it depends on consolidateD2S - confirm */
    }

    closedir(directory);
}
File added
rabi_noun s._noun de_noun rabide@yahoo.com prospect_noun place_noun home_noun bellaire_noun tx_noun work_noun rabi_noun s._noun de_noun rabide@yahoo.com prospect_noun place_noun home_noun bellaire_noun tx_noun work_noun
objective_noun financial_adjective engineering_noun position_noun energy_noun trading_noun finance_noun objective_noun financial_adjective engineering_noun position_noun energy_noun trading_noun finance_noun
profile_noun over_other ten_noun year_noun diverse_adjective experience_noun risk_noun analysis_noun management_noun energy_noun sector_noun last_adjective four_noun which_other be_verb trading_noun finance_noun profile_noun over_other ten_noun year_noun diverse_adjective experience_noun risk_noun analysis_noun management_noun energy_noun sector_noun last_adjective four_noun which_other be_verb trading_noun finance_noun
analytical_noun quantitative_adjective skill_noun structuring_noun pricing_noun energy_noun derivative_noun analytical_noun quantitative_adjective skill_noun structuring_noun pricing_noun energy_noun derivative_noun
expertise_noun trading_noun derivative_noun development_noun trade_noun analytic_noun exposure_noun management_noun risk_verb structure_verb e&p_noun project_noun finance_noun transaction_noun expertise_noun trading_noun derivative_noun development_noun trade_noun analytic_noun exposure_noun management_noun risk_verb structure_verb e&p_noun project_noun finance_noun transaction_noun
experience_noun shell_noun capital_noun inc._noun houston_noun tx_noun experience_noun shell_noun capital_noun inc._noun houston_noun tx_noun
present_adjective vice_noun president_noun reports_noun chief_noun financial_noun officer_noun responsible_adjective devise_verb strategy_noun manage_verb price_verb market_noun credit_noun risk_noun within_other structured_adjective transaction_noun present_adjective vice_noun president_noun reports_noun chief_noun financial_noun officer_noun responsible_adjective devise_verb strategy_noun manage_verb price_verb market_noun credit_noun risk_noun within_other structured_adjective transaction_noun
design_verb execute_verb oil_noun gas_noun hedge_noun eight_noun domestic_adjective two_noun international_adjective transaction_noun involve_verb over_other million_noun capital_noun risk_noun design_verb execute_verb oil_noun gas_noun hedge_noun eight_noun domestic_adjective two_noun international_adjective transaction_noun involve_verb over_other million_noun capital_noun risk_noun
develop_verb implement_verb framework_noun identification_noun mitigation_noun pricing_noun risk_noun producer_noun finance_noun transaction_noun develop_verb implement_verb framework_noun identification_noun mitigation_noun pricing_noun risk_noun producer_noun finance_noun transaction_noun
provide_verb sophisticated_adjective simulation_noun modeling_noun support_noun financial_adjective engineering_noun solution_noun e&p_noun finance_noun leasing_noun small_adjective business_noun finance_noun provide_verb sophisticated_adjective simulation_noun modeling_noun support_noun financial_adjective engineering_noun solution_noun e&p_noun finance_noun leasing_noun small_adjective business_noun finance_noun
led_verb development_noun computational_adjective infrastructure_noun risk_noun modeling_noun pricing_noun led_verb development_noun computational_adjective infrastructure_noun risk_noun modeling_noun pricing_noun
shell_noun oil_noun products_noun company_noun houston_noun tx_noun shell_noun oil_noun products_noun company_noun houston_noun tx_noun
trade_noun analytics_noun developer_noun derivatives_noun trader_noun traded_noun future_noun option_noun otc_noun derivative_noun crude_adjective oil_noun heating_noun oil_noun gasoline_noun trade_noun analytics_noun developer_noun derivatives_noun trader_noun traded_noun future_noun option_noun otc_noun derivative_noun crude_adjective oil_noun heating_noun oil_noun gasoline_noun
manage_verb net_adjective hydrocarbon_noun exposure_noun company_noun manage_verb net_adjective hydrocarbon_noun exposure_noun company_noun
develop_verb analytic_noun identify_verb speculative_adjective program_noun trading_noun opportunity_noun e.g._other refinery_noun margin_noun protection_noun develop_verb analytic_noun identify_verb speculative_adjective program_noun trading_noun opportunity_noun e.g._other refinery_noun margin_noun protection_noun
carry_verb out_adverb simulation_noun back_adverb testing_noun risk_noun adjusted_adjective performance_noun measurement_noun trading_noun strategy_noun carry_verb out_adverb simulation_noun back_adverb testing_noun risk_noun adjusted_adjective performance_noun measurement_noun trading_noun strategy_noun
price_verb embedded_adjective cap_noun devise_verb strategy_noun option_noun replication_noun dynamic_adjective hedging_noun price_verb embedded_adjective cap_noun devise_verb strategy_noun option_noun replication_noun dynamic_adjective hedging_noun
shell_noun e&p_noun technology_noun company_noun houston_noun tx_noun shell_noun e&p_noun technology_noun company_noun houston_noun tx_noun
senior_noun research_noun engineer_noun research_noun engineer_noun use_verb reliability_noun analysis_noun solve_verb wide_adjective variety_noun engineering_noun problem_noun senior_noun research_noun engineer_noun research_noun engineer_noun use_verb reliability_noun analysis_noun solve_verb wide_adjective variety_noun engineering_noun problem_noun
model_verb environmental_adjective structural_adjective response_noun develop_verb design_noun code_noun criterion_noun carry_verb out_adverb decision_noun analysis_noun under_other uncertainty_noun surface_noun system_noun selection_noun etc._other model_verb environmental_adjective structural_adjective response_noun develop_verb design_noun code_noun criterion_noun carry_verb out_adverb decision_noun analysis_noun under_other uncertainty_noun surface_noun system_noun selection_noun etc._other
deliver_verb enable_verb technology_noun risk-based_adjective design_noun recipe_noun development_noun complex_adjective engineering_noun system_noun range_verb billion-dollar_adjective tension_noun leg_noun platform_noun requalification_noun aging_noun fleet_noun offshore_adjective jacket_noun structure_noun deliver_verb enable_verb technology_noun risk-based_adjective design_noun recipe_noun development_noun complex_adjective engineering_noun system_noun range_verb billion-dollar_adjective tension_noun leg_noun platform_noun requalification_noun aging_noun fleet_noun offshore_adjective jacket_noun structure_noun
brown_adjective root_noun inc._noun houston_noun tx_noun brown_adjective root_noun inc._noun houston_noun tx_noun
naval_noun architect_noun software_noun troubleshooter_noun carry_verb out_adverb naval_noun architectural_noun design_noun motion_noun response_noun modeling_noun downtime_noun analysis_noun environmental_adjective datum_noun base_noun management_noun software_noun development_noun maintenance_noun support_noun offshore_adjective structure_noun design_noun construction_noun naval_noun architect_noun software_noun troubleshooter_noun carry_verb out_adverb naval_noun architectural_noun design_noun motion_noun response_noun modeling_noun downtime_noun analysis_noun environmental_adjective datum_noun base_noun management_noun software_noun development_noun maintenance_noun support_noun offshore_adjective structure_noun design_noun construction_noun
education_noun university_noun california_noun berkeley_noun ca_noun education_noun university_noun california_noun berkeley_noun ca_noun
ph.d._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun offshore_noun structural_noun system_noun reliability_noun wave-load_noun modeling_noun system_noun behavior_noun analysis_noun ph.d._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun offshore_noun structural_noun system_noun reliability_noun wave-load_noun modeling_noun system_noun behavior_noun analysis_noun
probabilistically_adverb model_verb multidimensional_adjective hazard_noun effect_noun performance_noun complicated_adjective system_noun develop_verb methodology_noun characterize_verb system_noun failure_noun risk_noun probabilistically_adverb model_verb multidimensional_adjective hazard_noun effect_noun performance_noun complicated_adjective system_noun develop_verb methodology_noun characterize_verb system_noun failure_noun risk_noun
work_verb research_noun associate_noun reliability_noun marine_noun structures_noun center_noun stanford_noun university_noun consultant_noun offshore_adjective oil_noun gas_noun industry_noun work_verb research_noun associate_noun reliability_noun marine_noun structures_noun center_noun stanford_noun university_noun consultant_noun offshore_adjective oil_noun gas_noun industry_noun
university_noun california_noun berkeley_noun ca_noun university_noun california_noun berkeley_noun ca_noun
m.s._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun simulation_noun random_noun seaway_noun towing_noun tank_noun random_noun walk_verb frequency_noun method_noun m.s._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun simulation_noun random_noun seaway_noun towing_noun tank_noun random_noun walk_verb frequency_noun method_noun
work_verb research_noun assistant_noun develop_verb software_noun time_noun series_noun analysis_noun model_noun testing_noun calibration_noun work_verb research_noun assistant_noun develop_verb software_noun time_noun series_noun analysis_noun model_noun testing_noun calibration_noun
indian_noun institute_noun technology_noun kharagpur_noun india_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun
b.tech._noun naval_noun architecture_noun graduate_verb first_adjective class_noun honor_noun rank_verb first_adverb class_noun b.tech._noun naval_noun architecture_noun graduate_verb first_adjective class_noun honor_noun rank_verb first_adverb class_noun
relevant_adjective training_noun credit_noun risk_noun modeling_noun stanford_noun university_noun stanford_noun ca_noun relevant_adjective training_noun credit_noun risk_noun modeling_noun stanford_noun university_noun stanford_noun ca_noun
october_noun finance_noun accounting_noun executive_noun rice_noun university_noun houston_noun tx_noun october_noun finance_noun accounting_noun executive_noun rice_noun university_noun houston_noun tx_noun
august_noun training_noun modules_noun product_noun knowledge_noun structured_noun project_noun finance_noun securitization_noun credit_noun strategy_noun in-house_noun training_noun dc_noun gardener_noun euromoney_noun august_noun training_noun modules_noun product_noun knowledge_noun structured_noun project_noun finance_noun securitization_noun credit_noun strategy_noun in-house_noun training_noun dc_noun gardener_noun euromoney_noun
january_noun april_noun risk_noun risk_noun conference_noun washington_noun d.c._noun june_noun economics_noun supply_noun refining_noun marketing_noun stone_noun bond_noun corp._noun houston_noun tx_noun january_noun april_noun risk_noun risk_noun conference_noun washington_noun d.c._noun june_noun economics_noun supply_noun refining_noun marketing_noun stone_noun bond_noun corp._noun houston_noun tx_noun
april_noun understand_verb apply_verb financial_adjective mathematics_noun energy_noun derivative_noun efficient_adjective pricing_noun trading_noun risk_noun management_noun risk_noun conferences_noun new_noun york_noun ny_noun april_noun understand_verb apply_verb financial_adjective mathematics_noun energy_noun derivative_noun efficient_adjective pricing_noun trading_noun risk_noun management_noun risk_noun conferences_noun new_noun york_noun ny_noun
march_noun practical_noun strategic_noun application_noun var_noun energy_noun industries_noun risk_noun conferences_noun houston_noun tx_noun march_noun practical_noun strategic_noun application_noun var_noun energy_noun industries_noun risk_noun conferences_noun houston_noun tx_noun
december_noun financial_noun modeling_noun s-plus_noun mathsoft_noun new_noun york_noun ny_noun december_noun financial_noun modeling_noun s-plus_noun mathsoft_noun new_noun york_noun ny_noun
october_noun option_noun analytic_noun pricing_noun option_noun exotic_noun options_noun cibc_noun school_noun financial_noun products_noun houston_noun tx_noun october_noun option_noun analytic_noun pricing_noun option_noun exotic_noun options_noun cibc_noun school_noun financial_noun products_noun houston_noun tx_noun
september_noun fundamental_noun energy_noun basis_noun trading_noun princeton_noun energy_noun houston_noun tx_noun september_noun fundamental_noun energy_noun basis_noun trading_noun princeton_noun energy_noun houston_noun tx_noun
feb_noun energy_noun derivatives_noun price_noun risk_noun management_noun energy_noun institute_noun univ._noun houston_noun houston_noun tx_noun feb_noun energy_noun derivatives_noun price_noun risk_noun management_noun energy_noun institute_noun univ._noun houston_noun houston_noun tx_noun
january_noun april_noun latest_adjective development_noun advanced_noun mathematics_noun derivative_noun risk_noun conference_noun new_noun york_noun ny_noun january_noun april_noun latest_adjective development_noun advanced_noun mathematics_noun derivative_noun risk_noun conference_noun new_noun york_noun ny_noun
december_noun options_noun seminar_noun nymex_noun houston_noun tx_noun december_noun options_noun seminar_noun nymex_noun houston_noun tx_noun
october_noun october_noun
select_verb honors_noun activities_noun present_verb seminar_noun credit_noun risk_noun e&p_noun mezzanine_noun finance_noun global_noun association_noun risk_noun professional_noun houston_noun chapter_noun tx_noun select_verb honors_noun activities_noun present_verb seminar_noun credit_noun risk_noun e&p_noun mezzanine_noun finance_noun global_noun association_noun risk_noun professional_noun houston_noun chapter_noun tx_noun
june_noun special_adjective recognition_noun award_noun shell_noun oil_noun products_noun company_noun june_noun special_adjective recognition_noun award_noun shell_noun oil_noun products_noun company_noun
committee_noun membership_noun panelist_noun author_noun lecturer_noun publication_noun reviewer_noun etc._other committee_noun membership_noun panelist_noun author_noun lecturer_noun publication_noun reviewer_noun etc._other
asce_noun api_noun otc_noun asme_noun omae_noun etc._other asce_noun api_noun otc_noun asme_noun omae_noun etc._other
receive_verb omae_noun award_noun american_noun society_noun mechanical_adjective engineering_noun recognition_noun outstanding_adjective originality_noun significance_noun paper_noun title_verb development_noun reliability-based_adjective global_adjective design_noun equation_noun tension_noun leg_noun platforms_noun receive_verb omae_noun award_noun american_noun society_noun mechanical_adjective engineering_noun recognition_noun outstanding_adjective originality_noun significance_noun paper_noun title_verb development_noun reliability-based_adjective global_adjective design_noun equation_noun tension_noun leg_noun platforms_noun
short_adjective course_noun instructor_noun seminar_noun speaker_noun university_noun texas_noun austin_noun rice_noun university_noun university_noun houston_noun short_adjective course_noun instructor_noun seminar_noun speaker_noun university_noun texas_noun austin_noun rice_noun university_noun university_noun houston_noun
sea_noun grant_noun association_noun award_noun excellence_noun research_noun sea_noun grant_noun association_noun usa_noun sea_noun grant_noun association_noun award_noun excellence_noun research_noun sea_noun grant_noun association_noun usa_noun
institute_noun silver_noun medal_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun institute_noun silver_noun medal_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun
national_noun science_noun talent_noun search_verb scholarship_noun government_noun india_noun national_noun science_noun talent_noun search_verb scholarship_noun government_noun india_noun
personal_adjective data_noun date_noun birth_noun september_noun us_noun citizen_noun marry_verb one_noun child_noun personal_adjective data_noun date_noun birth_noun september_noun us_noun citizen_noun marry_verb one_noun child_noun
reference_noun available_adjective upon_other request_noun reference_noun available_adjective upon_other request_noun
document_noun properties_noun title_noun rabi_noun s_noun author_noun shell_noun chemical_noun company_noun template_noun normal_adjective last_adjective save_verb grady_adjective revision_noun number_noun application_noun microsoft_noun word_noun document_noun properties_noun title_noun rabi_noun s_noun author_noun shell_noun chemical_noun company_noun template_noun normal_adjective last_adjective save_verb grady_adjective revision_noun number_noun application_noun microsoft_noun word_noun
total_adjective editing_noun time_noun last_adjective print_verb create_verb last_adjective save_verb company_noun shell_noun chemical_noun company_noun total_adjective editing_noun time_noun last_adjective print_verb create_verb last_adjective save_verb company_noun shell_noun chemical_noun company_noun
meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun
calendar_noun entry_noun appointment_noun calendar_noun entry_noun appointment_noun
description_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun jerry_noun conference_noun room_noun description_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun jerry_noun conference_noun room_noun
date_noun time_noun pm_noun pm_noun central_noun standard_noun time_noun date_noun time_noun pm_noun pm_noun central_noun standard_noun time_noun
detailed_adjective description_noun united_noun states_noun license_noun detailed_adjective description_noun united_noun states_noun license_noun
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment