Commit 3179d5fd by etcart

added comments and explanations

parent d78631fd
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
#if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
/* fixed-size text buffer; presumably the word this RIV belongs to - TODO confirm */
char name[100];
/* values[i] is the value stored at index locations[i]; the consolidate
* functions in this file point it at locations+count, inside the same
* allocation as locations */
int *values;
/* vector indices of the stored entries; in the consolidate functions this
* is the base of the malloc'd block, so it is the pointer to free */
int *locations;
/* number of location/value pairs */
size_t count;
/* read from the lexicon file header by fLexPull; presumably an occurrence
* count - TODO confirm */
unsigned int frequency;
/* read from the lexicon file header by fLexPull; not computed in this section */
float magnitude;
/* NOTE(review): not referenced anywhere in the visible code */
int boolean;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
/* fLexPush uses this as the file name under the lexicon directory */
char name[100];
/* dense value array; denseAllocate and fLexPull allocate RIVSIZE+1 ints */
int* values;
/* points at the extra int placed immediately after the RIVSIZE values,
* so it shares the values allocation and is not freed separately */
int* frequency;
/* written to and read from the lexicon file header; not computed in this section */
float magnitude;
/* non-zero marks a cache slot that cacheDump must push out with fLexPush */
int cached;
}denseRIV;
/*RIVKey, holds globally important data that should not be changed partway through
* first function call in the program should always be:
* RIVInit();
* this will set these variables, check for incompatible choices, and open up
* memory blocks which the system will use in the background
*/
/* RIVData is the library's shared state; its single static instance is
 * RIVKey, which RIVInit fills in
 */
static struct RIVData{
    /* set to sqrt(RIVSIZE) by RIVInit */
    int I2SThreshold;
    /* scratch buffer malloc'd by RIVInit; the consolidate functions use its
     * second and third RIVSIZE-int segments as temporary storage */
    int *h_tempBlock;
    /* size of h_tempBlock, counted in ints */
    int tempSize;
    /* zeroed by RIVInit; not otherwise used in this section */
    int thing;
    /* cache of dense RIVs, walked by cacheDump */
    denseRIV RIVCache[CACHESIZE];
} RIVKey;
/* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations.
*/
void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/
void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs
*/
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 * each call writes NONZEROS locations into seeds, starting at index seedCount
 * (the prototype previously read "makesparseLocations", which did not match
 * the definition and so declared a function that does not exist)
 */
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump();
int* addI2D(int* destination, int* locations, size_t seedCount);
denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* addS2D adds a sparse RIV into a dense value array: for every
 * location/value pair, the value is added at destination[location].
 * no bounds check is made, so destination must be indexable by every
 * location held in input. returns destination
 */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] += input.values[pair];
    }
    return destination;
}
/* mapI2D builds a new dense vector of RIVSIZE ints from an implicit RIV.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked and must lie in [0, RIVSIZE).
 * returns a calloc'd array the caller must free, or NULL when the
 * allocation fails
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = calloc(RIVSIZE, sizeof(int));
    if(!destination){
        return NULL;
    }
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* addI2D adds an implicit RIV onto an existing dense vector.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked: destination must be indexable by
 * every one of them. returns destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* consolidateI2SIndirect turns an implicit RIV into a sparseRIV by way of
 * a temporary dense vector: expand with mapI2D, compact with
 * consolidateD2S, then release the dense intermediate
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
    int *expanded = mapI2D(implicit, valueCount);
    sparseRIV compacted = consolidateD2S(expanded);

    free(expanded);
    return compacted;
}
/* consolidateI2SDirect turns an implicit RIV into a sparseRIV without
 * building a dense vector. implicit entries are consumed in order with
 * alternating sign (+1, -1, +1, ...); entries naming the same location are
 * summed. locations appear in the output in order of first appearance.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * the result owns one malloc'd block: free(result.locations) releases
 * both arrays. on allocation failure count is 0 and both pointers are NULL.
 * NOTE(review): entries that sum to 0 are kept, unlike consolidateD2S
 */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
    /* zero the metadata so no indeterminate fields are returned */
    sparseRIV sparseOut = {0};
    int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
    int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
    int add = 1;

    for(size_t i=0; i<valueCount; i++){
        /* look for an existing slot holding this location */
        size_t j = 0;
        while(j<sparseOut.count && locationsTemp[j] != implicit[i]){
            j++;
        }
        if(j == sparseOut.count){
            /* first sighting: open a new slot */
            locationsTemp[j] = implicit[i];
            valuesTemp[j] = 0;
            sparseOut.count++;
        }
        /* accumulate into the matching slot (index j, not i) */
        valuesTemp[j] += add;
        add = -add;
    }

    sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
    if(!sparseOut.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(sparseOut.count){
            fputs("memory allocation failed", stderr);
        }
        sparseOut.count = 0;
        return sparseOut;
    }
    sparseOut.values = sparseOut.locations+sparseOut.count;
    /* the two temp arrays are RIVSIZE ints apart, so each is copied separately */
    memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
    memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
    return sparseOut;
}
/* consolidateD2S scans the RIVSIZE ints of denseInput and returns a
 * sparseRIV holding every non-zero entry, in ascending index order.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * metadata fields of the result are zeroed. the result owns one malloc'd
 * block: free(result.locations) releases both arrays. on allocation
 * failure count is 0 and both pointers are NULL (previously the copy
 * went ahead into a NULL pointer)
 */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output = {0};
    /* key/value pairs will be loaded to a worst-case sized temporary slot */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;

    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[output.count] = i;
            values[output.count] = denseInput[i];
            /* track size of forming sparseRIV */
            output.count++;
        }
    }
    /* a slot is opened for the locations/values pair */
    output.locations = malloc(output.count*2*sizeof(int));
    if(!output.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(output.count){
            fputs("memory allocation failed", stderr); //*TODO enable fail point knowledge
        }
        output.count = 0;
        return output;
    }
    output.values = output.locations + output.count;
    memcpy(output.locations, locations, output.count*sizeof(int));
    memcpy(output.values, values, output.count*sizeof(int));
    return output;
}
/* RIVInit prepares the global RIVKey state: it sets the I2S threshold,
 * installs signalSecure as the SIGSEGV handler, allocates the temp block
 * and clears the dense RIV cache.
 * if the temp block cannot be allocated a message is written to stderr,
 * h_tempBlock is left NULL and tempSize is 0
 */
void RIVInit(){
    RIVKey.I2SThreshold = sqrt(RIVSIZE);

    /* the whole sigaction struct must be initialised: an indeterminate
     * sa_mask would block an arbitrary set of signals in the handler */
    struct sigaction action;
    memset(&action, 0, sizeof action);
    sigemptyset(&action.sa_mask);
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &action, NULL);

    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by filetoL2 functions */
    RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
    if(RIVKey.h_tempBlock){
        RIVKey.tempSize = 3*RIVSIZE;
    }else{
        RIVKey.tempSize = 0;
        fputs("RIVInit: temp block allocation failed\n", stderr);
    }
    RIVKey.thing = 0;
    /* open a slot for a cache of dense RIVs, optimized for frequent accesses */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* RIVCleanup dumps the cache to the lexicon files with cacheDump and
 * releases the temp block allocated by RIVInit. the global pointer is
 * reset afterwards so that a repeated call does not free it twice
 */
void RIVCleanup(){
    if(cacheDump()){
        puts("cache dump failed, some lexicon data was lost");
    }
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed folds the bytes of a NUL-terminated word into an int seed:
 * byte number i is shifted left by 5*i bits and added to the total.
 * the arithmetic is done unsigned with the shift count taken modulo 32,
 * so long words no longer hit undefined behaviour (shift by >= the width
 * of int, signed overflow). this matches what the old signed code
 * produced on x86, where the hardware masks the shift count to 5 bits.
 * assumes a 32-bit int. an empty word gives 0; word must not be NULL
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        /* left-shift 5 more each time *should* make seeds unique to words */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
        word++;
    }
    return (int)seed;
}
/* makeSparseLocations writes the NONZEROS pseudo-random locations of one
 * word into locations, starting at index count. the generator is reseeded
 * from the word via wordtoSeed, so a given word always yields the same
 * locations. each location is rand()%RIVSIZE
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *cursor = locations+count;
    int *const end = cursor+NONZEROS;

    srand(wordtoSeed(word));
    /* two per pass; NONZEROS is guaranteed even by the check at the top of the file */
    for(; cursor<end; cursor+=2){
        cursor[0] = rand()%RIVSIZE;
        cursor[1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes a dense RIV to the file lexicon/<name> in sparse form:
 * count (size_t), frequency (int), magnitude (float), then count
 * locations and count values (int each).
 * returns 0 on success, in which case RIVout.values has been freed.
 * returns 1 if the file could not be opened or fully written; in that
 * case RIVout.values is NOT freed and still belongs to the caller
 * (write and close results were previously ignored)
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself. the precision bounds the read
     * of name even if it lacks a terminator */
    snprintf(pathString, sizeof pathString, "lexicon/%.*s",
             (int)(sizeof RIVout.name - 1), RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    sparseRIV temp = consolidateD2S(RIVout.values);
    int failed = 0;
    failed |= fwrite(&temp.count, sizeof temp.count, 1, lexWord) != 1;
    failed |= fwrite(RIVout.frequency, sizeof *RIVout.frequency, 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof RIVout.magnitude, 1, lexWord) != 1;
    if(temp.count){
        failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
        failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
    }
    /* fclose flushes, so its result is part of the write's success */
    failed |= fclose(lexWord) != 0;
    free(temp.locations);
    if(failed){
        fprintf(stderr, "lexicon write has failed for word: %s\n", pathString);
        return 1;
    }
    free(RIVout.values);
    return 0;
}
/* fLexPull reads one lexicon entry, in the layout written by fLexPush,
 * from an open file and expands it into a newly allocated denseRIV.
 * the result's values block holds RIVSIZE+1 ints (frequency is the last
 * one) and must be freed by the caller; name is left empty.
 * failure is reported through the result: magnitude is -1 and the vector
 * is all zeros when the entry is truncated or names a location outside
 * [0, RIVSIZE); values is NULL when allocation fails.
 * count and the locations come from the file and are validated before
 * use (previously they were trusted, allowing writes outside values)
 */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = {0};
    sparseRIV temp = {0};
    /* pessimistic default, replaced once the whole entry has been read */
    output.magnitude = -1;
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(!output.values){
        return output;
    }
    output.frequency = output.values+RIVSIZE;

    if(fread(&temp.count, sizeof temp.count, 1, lexWord) != 1
    || fread(&temp.frequency, sizeof temp.frequency, 1, lexWord) != 1
    || fread(&temp.magnitude, sizeof temp.magnitude, 1, lexWord) != 1
    || temp.count > (size_t)RIVSIZE){
        return output;
    }

    int intact = 1;
    if(temp.count){
        temp.locations = malloc(temp.count*2*sizeof(int));
        if(!temp.locations){
            return output;
        }
        temp.values = temp.locations+temp.count;
        intact = fread(temp.locations, sizeof(int), temp.count, lexWord) == temp.count
              && fread(temp.values, sizeof(int), temp.count, lexWord) == temp.count;
        for(size_t i=0; intact && i<temp.count; i++){
            intact = temp.locations[i] >= 0 && temp.locations[i] < RIVSIZE;
        }
    }
    if(intact){
        addS2D(output.values, temp);
        *(output.frequency) = temp.frequency;
        output.magnitude = temp.magnitude;
    }
    free(temp.locations);
    return output;
}
/* signalSecure is the handler RIVInit installs with sigaction: it dumps
 * the cache, reports the outcome on stdout, then restores the default
 * disposition for the signal and re-sends it to this process.
 * si and arg are required by the SA_SIGINFO handler signature but unused
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    (void)si;
    (void)arg;

    const char *report = "cache dumped successfully";
    if(cacheDump()){
        report = "cache dump failed, some lexicon data lost";
    }
    puts(report);

    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
int cacheDump(){
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
flag += fLexPush(*cache_slider);
}
cache_slider++;
}
return flag;
}
/* denseAllocate returns a denseRIV holding a zero vector: values is a
 * calloc'd block of RIVSIZE+1 ints, name is empty, magnitude and cached
 * are 0. the caller owns values and must free it.
 * if the allocation fails both values and frequency are NULL (previously
 * frequency was computed from the NULL pointer and name was left
 * uninitialised)
 */
denseRIV denseAllocate(){
    denseRIV output = {0};
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(output.values){
        /* for compact memory use, frequency is placed immediately after values */
        output.frequency = output.values+RIVSIZE;
    }
    output.magnitude = 0;
    output.cached = 0;
    return output;
}
/*TODO add a simplified free function*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
#if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
/* fixed-size text buffer; presumably the word this RIV belongs to - TODO confirm */
char name[100];
/* values[i] is the value stored at index locations[i]; the consolidate
* functions in this file point it at locations+count, inside the same
* allocation as locations */
int *values;
/* vector indices of the stored entries; in the consolidate functions this
* is the base of the malloc'd block, so it is the pointer to free */
int *locations;
/* number of location/value pairs */
size_t count;
/* read from the lexicon file header by fLexPull; presumably an occurrence
* count - TODO confirm */
unsigned int frequency;
/* read from the lexicon file header by fLexPull; not computed in this section */
float magnitude;
/* NOTE(review): not referenced anywhere in the visible code */
int boolean;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
/* fLexPush uses this as the file name under the lexicon directory */
char name[100];
/* dense value array; denseAllocate and fLexPull allocate RIVSIZE+1 ints */
int* values;
/* points at the extra int placed immediately after the RIVSIZE values,
* so it shares the values allocation and is not freed separately */
int* frequency;
/* written to and read from the lexicon file header; not computed in this section */
float magnitude;
/* non-zero marks a cache slot that cacheDump must push out with fLexPush */
int cached;
}denseRIV;
/*RIVKey, holds globally important data that should not be changed partway through
* first function call in the program should always be:
* RIVInit();
* this will set these variables, check for incompatible choices, and open up
* memory blocks which the system will use in the background
*/
/* RIVData is the library's shared state; its single static instance is
 * RIVKey, which RIVInit fills in
 */
static struct RIVData{
    /* set to sqrt(RIVSIZE) by RIVInit */
    int I2SThreshold;
    /* scratch buffer malloc'd by RIVInit; the consolidate functions use its
     * second and third RIVSIZE-int segments as temporary storage */
    int *h_tempBlock;
    /* size of h_tempBlock, counted in ints */
    int tempSize;
    /* zeroed by RIVInit; not otherwise used in this section */
    int thing;
    /* cache of dense RIVs, walked by cacheDump */
    denseRIV RIVCache[CACHESIZE];
} RIVKey;
/* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations.
*/
void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/
void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs
*/
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 * each call writes NONZEROS locations into seeds, starting at index seedCount
 * (the prototype previously read "makesparseLocations", which did not match
 * the definition and so declared a function that does not exist)
 */
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump();
int* addI2D(int* destination, int* locations, size_t seedCount);
denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* addS2D adds a sparse RIV into a dense value array: for every
 * location/value pair, the value is added at destination[location].
 * no bounds check is made, so destination must be indexable by every
 * location held in input. returns destination
 */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] += input.values[pair];
    }
    return destination;
}
/* mapI2D builds a new dense vector of RIVSIZE ints from an implicit RIV.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked and must lie in [0, RIVSIZE).
 * returns a calloc'd array the caller must free, or NULL when the
 * allocation fails
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = calloc(RIVSIZE, sizeof(int));
    if(!destination){
        return NULL;
    }
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* addI2D adds an implicit RIV onto an existing dense vector.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked: destination must be indexable by
 * every one of them. returns destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* consolidateI2SIndirect turns an implicit RIV into a sparseRIV by way of
 * a temporary dense vector: expand with mapI2D, compact with
 * consolidateD2S, then release the dense intermediate
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
    int *expanded = mapI2D(implicit, valueCount);
    sparseRIV compacted = consolidateD2S(expanded);

    free(expanded);
    return compacted;
}
/* consolidateI2SDirect turns an implicit RIV into a sparseRIV without
 * building a dense vector. implicit entries are consumed in order with
 * alternating sign (+1, -1, +1, ...); entries naming the same location are
 * summed. locations appear in the output in order of first appearance.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * the result owns one malloc'd block: free(result.locations) releases
 * both arrays. on allocation failure count is 0 and both pointers are NULL.
 * NOTE(review): entries that sum to 0 are kept, unlike consolidateD2S
 */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
    /* zero the metadata so no indeterminate fields are returned */
    sparseRIV sparseOut = {0};
    int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
    int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
    int add = 1;

    for(size_t i=0; i<valueCount; i++){
        /* look for an existing slot holding this location */
        size_t j = 0;
        while(j<sparseOut.count && locationsTemp[j] != implicit[i]){
            j++;
        }
        if(j == sparseOut.count){
            /* first sighting: open a new slot */
            locationsTemp[j] = implicit[i];
            valuesTemp[j] = 0;
            sparseOut.count++;
        }
        /* accumulate into the matching slot (index j, not i) */
        valuesTemp[j] += add;
        add = -add;
    }

    sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
    if(!sparseOut.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(sparseOut.count){
            fputs("memory allocation failed", stderr);
        }
        sparseOut.count = 0;
        return sparseOut;
    }
    sparseOut.values = sparseOut.locations+sparseOut.count;
    /* the two temp arrays are RIVSIZE ints apart, so each is copied separately */
    memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
    memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
    return sparseOut;
}
/* consolidateD2S scans the RIVSIZE ints of denseInput and returns a
 * sparseRIV holding every non-zero entry, in ascending index order.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * metadata fields of the result are zeroed. the result owns one malloc'd
 * block: free(result.locations) releases both arrays. on allocation
 * failure count is 0 and both pointers are NULL (previously the copy
 * went ahead into a NULL pointer)
 */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output = {0};
    /* key/value pairs will be loaded to a worst-case sized temporary slot */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;

    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[output.count] = i;
            values[output.count] = denseInput[i];
            /* track size of forming sparseRIV */
            output.count++;
        }
    }
    /* a slot is opened for the locations/values pair */
    output.locations = malloc(output.count*2*sizeof(int));
    if(!output.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(output.count){
            fputs("memory allocation failed", stderr); //*TODO enable fail point knowledge
        }
        output.count = 0;
        return output;
    }
    output.values = output.locations + output.count;
    memcpy(output.locations, locations, output.count*sizeof(int));
    memcpy(output.values, values, output.count*sizeof(int));
    return output;
}
/* RIVInit prepares the global RIVKey state: it sets the I2S threshold,
 * installs signalSecure as the SIGSEGV handler, allocates the temp block
 * and clears the dense RIV cache.
 * if the temp block cannot be allocated a message is written to stderr,
 * h_tempBlock is left NULL and tempSize is 0
 */
void RIVInit(){
    RIVKey.I2SThreshold = sqrt(RIVSIZE);

    /* the whole sigaction struct must be initialised: an indeterminate
     * sa_mask would block an arbitrary set of signals in the handler */
    struct sigaction action;
    memset(&action, 0, sizeof action);
    sigemptyset(&action.sa_mask);
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &action, NULL);

    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by filetoL2 functions */
    RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
    if(RIVKey.h_tempBlock){
        RIVKey.tempSize = 3*RIVSIZE;
    }else{
        RIVKey.tempSize = 0;
        fputs("RIVInit: temp block allocation failed\n", stderr);
    }
    RIVKey.thing = 0;
    /* open a slot for a cache of dense RIVs, optimized for frequent accesses */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* RIVCleanup dumps the cache to the lexicon files with cacheDump and
 * releases the temp block allocated by RIVInit. the global pointer is
 * reset afterwards so that a repeated call does not free it twice
 */
void RIVCleanup(){
    if(cacheDump()){
        puts("cache dump failed, some lexicon data was lost");
    }
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed folds the bytes of a NUL-terminated word into an int seed:
 * byte number i is shifted left by 5*i bits and added to the total.
 * the arithmetic is done unsigned with the shift count taken modulo 32,
 * so long words no longer hit undefined behaviour (shift by >= the width
 * of int, signed overflow). this matches what the old signed code
 * produced on x86, where the hardware masks the shift count to 5 bits.
 * assumes a 32-bit int. an empty word gives 0; word must not be NULL
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        /* left-shift 5 more each time *should* make seeds unique to words */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
        word++;
    }
    return (int)seed;
}
/* makeSparseLocations writes the NONZEROS pseudo-random locations of one
 * word into locations, starting at index count. the generator is reseeded
 * from the word via wordtoSeed, so a given word always yields the same
 * locations. each location is rand()%RIVSIZE
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *cursor = locations+count;
    int *const end = cursor+NONZEROS;

    srand(wordtoSeed(word));
    /* two per pass; NONZEROS is guaranteed even by the check at the top of the file */
    for(; cursor<end; cursor+=2){
        cursor[0] = rand()%RIVSIZE;
        cursor[1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes a dense RIV to the file lexicon/<name> in sparse form:
 * count (size_t), frequency (int), magnitude (float), then count
 * locations and count values (int each).
 * returns 0 on success, in which case RIVout.values has been freed.
 * returns 1 if the file could not be opened or fully written; in that
 * case RIVout.values is NOT freed and still belongs to the caller
 * (write and close results were previously ignored)
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself. the precision bounds the read
     * of name even if it lacks a terminator */
    snprintf(pathString, sizeof pathString, "lexicon/%.*s",
             (int)(sizeof RIVout.name - 1), RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    sparseRIV temp = consolidateD2S(RIVout.values);
    int failed = 0;
    failed |= fwrite(&temp.count, sizeof temp.count, 1, lexWord) != 1;
    failed |= fwrite(RIVout.frequency, sizeof *RIVout.frequency, 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof RIVout.magnitude, 1, lexWord) != 1;
    if(temp.count){
        failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
        failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
    }
    /* fclose flushes, so its result is part of the write's success */
    failed |= fclose(lexWord) != 0;
    free(temp.locations);
    if(failed){
        fprintf(stderr, "lexicon write has failed for word: %s\n", pathString);
        return 1;
    }
    free(RIVout.values);
    return 0;
}
/* fLexPull reads one lexicon entry, in the layout written by fLexPush,
 * from an open file and expands it into a newly allocated denseRIV.
 * the result's values block holds RIVSIZE+1 ints (frequency is the last
 * one) and must be freed by the caller; name is left empty.
 * failure is reported through the result: magnitude is -1 and the vector
 * is all zeros when the entry is truncated or names a location outside
 * [0, RIVSIZE); values is NULL when allocation fails.
 * count and the locations come from the file and are validated before
 * use (previously they were trusted, allowing writes outside values)
 */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = {0};
    sparseRIV temp = {0};
    /* pessimistic default, replaced once the whole entry has been read */
    output.magnitude = -1;
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(!output.values){
        return output;
    }
    output.frequency = output.values+RIVSIZE;

    if(fread(&temp.count, sizeof temp.count, 1, lexWord) != 1
    || fread(&temp.frequency, sizeof temp.frequency, 1, lexWord) != 1
    || fread(&temp.magnitude, sizeof temp.magnitude, 1, lexWord) != 1
    || temp.count > (size_t)RIVSIZE){
        return output;
    }

    int intact = 1;
    if(temp.count){
        temp.locations = malloc(temp.count*2*sizeof(int));
        if(!temp.locations){
            return output;
        }
        temp.values = temp.locations+temp.count;
        intact = fread(temp.locations, sizeof(int), temp.count, lexWord) == temp.count
              && fread(temp.values, sizeof(int), temp.count, lexWord) == temp.count;
        for(size_t i=0; intact && i<temp.count; i++){
            intact = temp.locations[i] >= 0 && temp.locations[i] < RIVSIZE;
        }
    }
    if(intact){
        addS2D(output.values, temp);
        *(output.frequency) = temp.frequency;
        output.magnitude = temp.magnitude;
    }
    free(temp.locations);
    return output;
}
/* signalSecure is the handler RIVInit installs with sigaction: it dumps
 * the cache, reports the outcome on stdout, then restores the default
 * disposition for the signal and re-sends it to this process.
 * si and arg are required by the SA_SIGINFO handler signature but unused
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    (void)si;
    (void)arg;

    const char *report = "cache dumped successfully";
    if(cacheDump()){
        report = "cache dump failed, some lexicon data lost";
    }
    puts(report);

    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
int cacheDump(){
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
flag += fLexPush(*cache_slider);
}
cache_slider++;
}
return flag;
}
/* denseAllocate returns a denseRIV holding a zero vector: values is a
 * calloc'd block of RIVSIZE+1 ints, name is empty, magnitude and cached
 * are 0. the caller owns values and must free it.
 * if the allocation fails both values and frequency are NULL (previously
 * frequency was computed from the NULL pointer and name was left
 * uninitialised)
 */
denseRIV denseAllocate(){
    denseRIV output = {0};
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(output.values){
        /* for compact memory use, frequency is placed immediately after values */
        output.frequency = output.values+RIVSIZE;
    }
    output.magnitude = 0;
    output.cached = 0;
    return output;
}
/*TODO add a simplified free function*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#define CACHED 0x02
#define SPARSE 0x01
#define AVAILABLE 0x04
/* RIV is the single record type behind both sparseRIV and denseRIV
* (both are typedef'd to it below); which fields are meaningful depends
* on the form in use
*/
typedef struct{
/* fLexPush uses this as the file name under the lexicon directory */
char name[100];
/* sparse form: values[i] pairs with locations[i]; dense form: the full
* value array (fLexPull allocates RIVSIZE+1 ints) */
int *values;
/* sparse form only: vector indices of the stored entries; in the
* consolidate functions this is the base of the malloc'd block */
int *locations;
/* sparse form only: number of location/value pairs */
size_t count;
/* fLexPull points this at the extra int just past the RIVSIZE values */
unsigned int* frequency;
/* stored in the lexicon file; fLexPull sets it to -1 to flag a failed read */
float magnitude;
/* non-zero marks a cache slot that cacheDump must push out with fLexPush */
int cached;
/* NOTE(review): not referenced anywhere in the visible code */
int boolean;
/* bit set; the consolidate functions OR in SPARSE. CACHED and AVAILABLE
* are defined above but not used in the visible code */
int flags;
}RIV;
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef RIV sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef RIV denseRIV;
/*RIVKey, holds globally important data that should not be changed partway through
* first function call in the program should always be:
* RIVInit();
* this will set these variables, check for incompatible choices, and open up
* memory blocks which the system will use in the background
*/
/* RIVData is the library's shared state; its single static instance is
 * RIVKey, which RIVInit fills in
 */
static struct RIVData{
    /* not assigned anywhere in this section */
    size_t RIVsize;
    /* not assigned anywhere in this section */
    int nonZeros;
    /* set to sqrt(RIVSIZE) by RIVInit */
    int I2SThreshold;
    /* scratch buffer malloc'd by RIVInit; the consolidate functions use its
     * second and third RIVSIZE-int segments as temporary storage */
    int *h_tempBlock;
    /* size of h_tempBlock, counted in ints */
    int tempSize;
    /* zeroed by RIVInit; not otherwise used in this section */
    int thing;
    /* cache of dense RIVs, calloc'd by RIVInit and walked by cacheDump */
    denseRIV* RIVCache;
    /* number of slots in RIVCache; RIVInit sets it to CACHESIZE */
    int cacheSize;
} RIVKey;
/* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations.
*/
void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/
void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs
*/
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 * each call writes NONZEROS locations into seeds, starting at index seedCount
 * (the prototype previously read "makesparseLocations", which did not match
 * the definition and so declared a function that does not exist)
 */
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump();
int* addI2D(int* destination, int* locations, size_t seedCount);
denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* addS2D adds a sparse RIV into a dense value array: for every
 * location/value pair, the value is added at destination[location].
 * no bounds check is made, so destination must be indexable by every
 * location held in input. returns destination
 */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] += input.values[pair];
    }
    return destination;
}
/* mapI2D builds a new dense vector of RIVSIZE ints from an implicit RIV.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked and must lie in [0, RIVSIZE).
 * returns a calloc'd array the caller must free, or NULL when the
 * allocation fails
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = calloc(RIVSIZE, sizeof(int));
    if(!destination){
        return NULL;
    }
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* addI2D adds an implicit RIV onto an existing dense vector.
 * locations are consumed in order, alternately adding +1 and -1 at the
 * index they name. an odd valueCount leaves one trailing location, which
 * receives +1 (previously the loop read one element past the input).
 * locations are not range checked: destination must be indexable by
 * every one of them. returns destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    size_t i = 0;
    /* whole +1/-1 pairs */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* unpaired trailing location, if any */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* consolidateI2SIndirect turns an implicit RIV into a sparseRIV by way of
 * a temporary dense vector: expand with mapI2D, compact with
 * consolidateD2S, then release the dense intermediate
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
    int *expanded = mapI2D(implicit, valueCount);
    sparseRIV compacted = consolidateD2S(expanded);

    free(expanded);
    return compacted;
}
/* consolidateI2SDirect turns an implicit RIV into a sparseRIV without
 * building a dense vector. implicit entries are consumed in order with
 * alternating sign (+1, -1, +1, ...); entries naming the same location are
 * summed. locations appear in the output in order of first appearance.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * the result has only the SPARSE flag set and owns one malloc'd block:
 * free(result.locations) releases both arrays. on allocation failure
 * count is 0 and both pointers are NULL.
 * NOTE(review): entries that sum to 0 are kept, unlike consolidateD2S
 */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
    /* zero the record first: flags was previously OR'd while indeterminate */
    sparseRIV sparseOut = {0};
    int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
    int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
    int add = 1;

    sparseOut.flags |= SPARSE;
    for(size_t i=0; i<valueCount; i++){
        /* look for an existing slot holding this location */
        size_t j = 0;
        while(j<sparseOut.count && locationsTemp[j] != implicit[i]){
            j++;
        }
        if(j == sparseOut.count){
            /* first sighting: open a new slot */
            locationsTemp[j] = implicit[i];
            valuesTemp[j] = 0;
            sparseOut.count++;
        }
        /* accumulate into the matching slot (index j, not i) */
        valuesTemp[j] += add;
        add = -add;
    }

    sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
    if(!sparseOut.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(sparseOut.count){
            fputs("memory allocation failed", stderr);
        }
        sparseOut.count = 0;
        return sparseOut;
    }
    sparseOut.values = sparseOut.locations+sparseOut.count;
    /* the two temp arrays are RIVSIZE ints apart, so each is copied separately */
    memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
    memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
    return sparseOut;
}
/* consolidateD2S scans the RIVSIZE ints of denseInput and returns a
 * sparseRIV holding every non-zero entry, in ascending index order.
 * requires RIVInit to have been called (uses RIVKey.h_tempBlock).
 * the result has only the SPARSE flag set, its other metadata is zeroed,
 * and it owns one malloc'd block: free(result.locations) releases both
 * arrays. on allocation failure count is 0 and both pointers are NULL
 * (previously the copy went ahead into a NULL pointer)
 */
sparseRIV consolidateD2S(int *denseInput){
    /* zero the record first: flags was previously OR'd while indeterminate */
    sparseRIV output = {0};
    /* key/value pairs will be loaded to a worst-case sized temporary slot */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;

    output.flags |= SPARSE;
    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[output.count] = i;
            values[output.count] = denseInput[i];
            /* track size of forming sparseRIV */
            output.count++;
        }
    }
    /* a slot is opened for the locations/values pair */
    output.locations = malloc(output.count*2*sizeof(int));
    if(!output.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(output.count){
            fputs("memory allocation failed", stderr); //*TODO enable fail point knowledge
        }
        output.count = 0;
        return output;
    }
    output.values = output.locations + output.count;
    memcpy(output.locations, locations, output.count*sizeof(int));
    memcpy(output.values, values, output.count*sizeof(int));
    return output;
}
/* RIVInit prepares the global RIVKey state: it sets the I2S threshold,
 * installs signalSecure as the SIGSEGV handler, and allocates the temp
 * block and the dense RIV cache.
 * allocation failures are reported on stderr; the matching size field
 * (tempSize or cacheSize) is then 0 and the pointer NULL
 */
void RIVInit(){
    RIVKey.I2SThreshold = sqrt(RIVSIZE);

    /* the whole sigaction struct must be initialised: an indeterminate
     * sa_mask would block an arbitrary set of signals in the handler */
    struct sigaction action;
    memset(&action, 0, sizeof action);
    sigemptyset(&action.sa_mask);
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &action, NULL);

    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by filetoL2 functions */
    RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
    if(RIVKey.h_tempBlock){
        RIVKey.tempSize = 3*RIVSIZE;
    }else{
        RIVKey.tempSize = 0;
        fputs("RIVInit: temp block allocation failed\n", stderr);
    }
    RIVKey.thing = 0;

    /* open a slot for a cache of dense RIVs, optimized for frequent accesses */
    RIVKey.cacheSize = CACHESIZE;
    RIVKey.RIVCache = calloc(RIVKey.cacheSize, sizeof(denseRIV));
    /* calloc of 0 slots may return NULL without that being an error */
    if(!RIVKey.RIVCache && RIVKey.cacheSize){
        fputs("RIVInit: cache allocation failed\n", stderr);
        RIVKey.cacheSize = 0;
    }
}
/* RIVCleanup dumps the cache to the lexicon files with cacheDump and
 * releases the cache and temp block allocated by RIVInit.
 * the cache is freed unconditionally (calloc of 0 slots may still return
 * a block, and free(NULL) is harmless), and the globals are reset so a
 * repeated call or a late cacheDump does not touch freed memory
 */
void RIVCleanup(){
    if(cacheDump()){
        puts("cache dump failed, some lexicon data was lost");
    }
    free(RIVKey.RIVCache);
    RIVKey.RIVCache = NULL;
    RIVKey.cacheSize = 0;
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed folds the bytes of a NUL-terminated word into an int seed:
 * byte number i is shifted left by 5*i bits and added to the total.
 * the arithmetic is done unsigned with the shift count taken modulo 32,
 * so long words no longer hit undefined behaviour (shift by >= the width
 * of int, signed overflow). this matches what the old signed code
 * produced on x86, where the hardware masks the shift count to 5 bits.
 * assumes a 32-bit int. an empty word gives 0; word must not be NULL
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        /* left-shift 5 more each time *should* make seeds unique to words */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
        word++;
    }
    return (int)seed;
}
/* makeSparseLocations appends the index pairs generated for `word` to
 * `locations`, starting at offset `count`. The generator is seeded from
 * the word itself, and indices are written two per pass until NONZEROS
 * slots are covered.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *out = locations + count;
    srand(wordtoSeed(word));
    for(int written = 0; written < NONZEROS; written += 2){
        out[written] = rand()%RIVSIZE;
        out[written+1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes a denseRIV to "lexicon/<name>" as: frequency (one int),
 * magnitude (one float), then RIVSIZE ints of values.
 *
 * Returns 0 on success, 1 on failure. Ownership: RIVout.values is freed
 * only when the push succeeds; on any failure the caller keeps the block.
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    int failed = 0;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
    failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    /* fclose flushes, so a failure here also means lost data */
    failed |= fclose(lexWord) != 0;
    if(failed){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    free(RIVout.values);
    return 0;
}
/* fLexPull reads one lexicon entry from an open file: frequency (one int),
 * magnitude (one float), then RIVSIZE ints of values. The frequency slot
 * lives directly after the values in the same allocation.
 *
 * A short read or a failed allocation is signalled by magnitude == -1.
 * Items are counted one per element, so a complete read totals RIVSIZE+2.
 */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = {0};
    output.values = malloc( (RIVSIZE+1) *sizeof(int));
    if(!output.values){
        output.magnitude = -1;
        return output;
    }
    output.frequency = (unsigned int*)(output.values+RIVSIZE);
    size_t itemsRead = 0;
    itemsRead += fread(output.frequency, sizeof(int), 1, lexWord);
    itemsRead += fread(&(output.magnitude), sizeof(float), 1, lexWord);
    itemsRead += fread(output.values, sizeof(int), RIVSIZE, lexWord);
    if(itemsRead != (size_t)RIVSIZE+2){
        output.magnitude = -1;
    }
    output.cached = 0;
    return output;
}
/* signalSecure is the signal handler installed by RIVInit. It dumps the
 * cache, reports the outcome, then restores the default disposition and
 * re-sends the signal to this process. `si` and `arg` are unused.
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    const char *report = cacheDump()
        ? "cache dump failed, some lexicon data lost"
        : "cache dumped successfully";
    puts(report);
    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
int cacheDump(){
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+RIVKey.cacheSize;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
flag += fLexPush(*cache_slider);
}
cache_slider++;
}
return flag;
}
/* denseAllocate returns a zero vector: RIVSIZE zeroed ints plus one extra
 * int that holds the frequency, placed immediately after the values for
 * compact memory use. All other members are zeroed.
 * If the allocation fails, values and frequency are both NULL.
 */
denseRIV denseAllocate(){
    denseRIV output = {0};
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(output.values){
        output.frequency = (unsigned int*)(output.values+RIVSIZE);
    }
    output.magnitude = 0;
    output.cached = 0;
    return output;
}
/*TODO add a simplified free function*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#define CACHED 0x02
#define SPARSE 0x01
#define AVAILABLE 0x04
/* RIV is the single record type behind both the sparseRIV and denseRIV
 * typedefs; which members are meaningful depends on the form in use.
 */
typedef struct{
/* label of the vector; fLexPush uses it as the file name under "lexicon/" */
char name[100];
/* sparse form: one value per entry of locations; dense form: RIVSIZE ints */
int *values;
/* sparse form only: the index each entry of values belongs to */
int *locations;
/* sparse form only: number of location/value pairs */
size_t count;
/* dense form: points at the int stored directly after values (see denseAllocate) */
unsigned int* frequency;
/* fLexPull sets this to -1 when a lexicon read comes up short */
float magnitude;
/* non-zero marks a cache entry that cacheDump must push to disk */
int cached;
/* NOTE(review): not used in this header; looks like a caller-side marker - confirm */
int boolean;
/* bit set built from SPARSE / CACHED / AVAILABLE */
int flags;
}RIV;
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is advisable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef RIV sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef RIV denseRIV;
/* RIVKey holds globally important data that should not be changed partway through.
* The first function call in the program should always be:
* RIVInit();
* this will set these variables, check for incompatible choices, and open up
* memory blocks which the system will use in the background
*/
/* RIVData is the type of RIVKey, the library's global state. Because RIVKey
 * is declared static in this header, every translation unit that includes
 * it gets its own copy.
 */
struct RIVData{
/* presumably mirrors the RIVSIZE macro - TODO confirm where it is assigned */
size_t RIVsize;
/* presumably mirrors the NONZEROS macro - TODO confirm where it is assigned */
int nonZeros;
/* set to sqrt(RIVSIZE) by RIVInit */
int I2SThreshold;
/* scratch block of tempSize ints used by the consolidate functions */
int *h_tempBlock;
/* length of h_tempBlock in ints (3*RIVSIZE after RIVInit) */
int tempSize;
/* general purpose counter, zeroed by RIVInit */
int thing;
/* array of cacheSize dense RIVs, walked by cacheDump */
denseRIV* RIVCache;
/* number of slots in RIVCache (CACHESIZE after RIVInit) */
int cacheSize;
}static RIVKey;
/* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations.
*/
void RIVInit();
/* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/
void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs
*/
/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file.
 * Each call appends NONZEROS indices for `word` to `locations`, starting
 * at offset `count`. This produces an "implicit" RIV which can be used
 * with the mapI2D function to create a denseRIV.
 * (The declaration is spelled exactly like the definition below.)
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump();
int* addI2D(int* destination, int* locations, size_t seedCount);
denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* addS2D adds a sparse RIV into a dense value array: for every
 * location/value pair of `input`, the value is added at that index of
 * `destination`. Returns `destination`.
 */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] += input.values[pair];
    }
    return destination;
}
/* mapI2D expands an implicit RIV (an array of indices taken in pairs) into
 * a freshly allocated dense array of RIVSIZE ints: the first index of each
 * pair receives +1, the second -1. The pair walk is shared with addI2D.
 *
 * Returns the new array, which the caller must free, or NULL if the
 * allocation fails.
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = (int*)calloc(RIVSIZE,sizeof(int));
    if(!destination){
        printf("memory allocation failed");
        return NULL;
    }
    return addI2D(destination, locations, valueCount);
}
/* addI2D adds an implicit RIV into an existing dense array. `locations`
 * is consumed two entries at a time: the first index of each pair is
 * incremented, the second decremented. Returns `destination`.
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    for(size_t pair = 0; pair < valueCount; pair += 2){
        destination[locations[pair]] += 1;
        destination[locations[pair+1]] -= 1;
    }
    return destination;
}
/* consolidateI2SIndirect turns an implicit RIV into a sparseRIV by way of
 * a temporary dense array: expand with mapI2D, compact with
 * consolidateD2S (which flags the result as sparse), then release the
 * temporary.
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
    int *expanded = mapI2D(implicit, valueCount);
    sparseRIV compact = consolidateD2S(expanded);
    free(expanded);
    return compact;
}
/* consolidateI2SDirect turns an implicit RIV into a sparseRIV without a
 * dense intermediate. Indices at even positions contribute +1, those at
 * odd positions -1 (the same convention as mapI2D); repeated indices are
 * merged into one location/value pair. Pairs whose value cancels to 0
 * are kept.
 *
 * The pairs are built in the shared scratch block and then copied into
 * one allocation (locations first, values right after) owned by the
 * caller. On allocation failure the result has count 0 and NULL arrays.
 */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
    sparseRIV sparseOut = {0};
    int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
    int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
    for(size_t i=0; i<valueCount; i++){
        int add = (i%2) ? -1 : 1;
        /* look for an existing pair with this location */
        size_t j = 0;
        while(j<sparseOut.count && locationsTemp[j] != implicit[i]){
            j++;
        }
        if(j<sparseOut.count){
            /* accumulate into the matching pair (slot j, not i) */
            valuesTemp[j] += add;
        }else{
            locationsTemp[j] = implicit[i];
            valuesTemp[j] = add;
            sparseOut.count++;
        }
    }
    sparseOut.flags |= SPARSE;
    if(!sparseOut.count){
        return sparseOut;
    }
    sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
    if(!sparseOut.locations){
        printf("memory allocation failed");
        sparseOut.count = 0;
        return sparseOut;
    }
    sparseOut.values = sparseOut.locations+sparseOut.count;
    /* the two scratch arrays are RIVSIZE apart, so each is copied separately */
    memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
    memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
    return sparseOut;
}
/* consolidateD2S compacts a dense value array of RIVSIZE ints into a
 * sparseRIV holding only the non-zero entries. Metadata (name, magnitude,
 * ...) is not carried over; the result is zero-initialised apart from the
 * pairs and the SPARSE flag.
 *
 * The pairs live in one allocation (locations first, values right after)
 * that the caller frees through output.locations. An all-zero input, or a
 * failed allocation, yields count 0 with NULL arrays.
 */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output = {0};
    /* key/value pairs will be loaded to a worst-case sized temporary slot */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[output.count] = i;
            values[output.count] = denseInput[i];
            output.count++;
        }
    }
    output.flags |= SPARSE;
    if(!output.count){
        return output;
    }
    /* a slot is opened for the locations/values pair */
    output.locations = (int*) malloc(output.count*2*sizeof(int));
    if(!output.locations){
        printf("memory allocation failed"); //*TODO enable fail point knowledge
        output.count = 0;
        return output;
    }
    output.values = output.locations + output.count;
    memcpy(output.locations, locations, output.count*sizeof(int));
    memcpy(output.values, values, output.count*sizeof(int));
    return output;
}
void RIVInit(){
RIVKey.I2SThreshold = sqrt(RIVSIZE);
/* open a slot at least large enough for worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
struct sigaction action;
action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO;
//for(int i=1; i<27; i++){
sigaction(11,&action,NULL);
//}
RIVKey.h_tempBlock = (int*)malloc(3*RIVSIZE*sizeof(int));
RIVKey.tempSize = 3*RIVSIZE;
RIVKey.thing = 0;
RIVKey.cacheSize = CACHESIZE;
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
RIVKey.RIVCache = (denseRIV*)calloc(RIVKey.cacheSize,sizeof(denseRIV));
}
/* RIVCleanup closes a RIV program: it dumps cached data to the lexicon
 * files and releases the blocks allocated by RIVInit. The globals are
 * reset afterwards so that a late signalSecure/cacheDump call cannot walk
 * freed memory.
 */
void RIVCleanup(){
    if(cacheDump()){
        puts("cache dump failed, some lexicon data was lost");
    }
    /* free(NULL) is a no-op, so no CACHESIZE guard is needed */
    free(RIVKey.RIVCache);
    RIVKey.RIVCache = NULL;
    RIVKey.cacheSize = 0;
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed folds the bytes of a NUL-terminated word into an integer seed.
 * Character i is shifted left by 5*i bits and added to the running total.
 *
 * The arithmetic is done on unsigned int with the shift count wrapped at
 * 32, which is well defined for any word length. For words of up to six
 * characters this is exactly the original formula; for longer words it
 * pins the result the original (undefined) shift yields on x86, where
 * the hardware wraps shift counts at 32.
 */
int wordtoSeed(unsigned char* word){
    enum { SEED_BITS = 32, SHIFT_STEP = 5 };
    unsigned int seed = 0;
    for(unsigned int i=0; word[i]; i++){
        seed += (unsigned int)word[i] << ((i*SHIFT_STEP) % SEED_BITS);
    }
    return (int)seed;
}
/* makeSparseLocations appends the index pairs generated for `word` to
 * `locations`, starting at offset `count`. The generator is seeded from
 * the word itself, and indices are written two per pass until NONZEROS
 * slots are covered.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *out = locations + count;
    srand(wordtoSeed(word));
    for(int written = 0; written < NONZEROS; written += 2){
        out[written] = rand()%RIVSIZE;
        out[written+1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes a denseRIV to "lexicon/<name>" as: frequency (one int),
 * magnitude (one float), then RIVSIZE ints of values.
 *
 * Returns 0 on success, 1 on failure. Ownership: RIVout.values is freed
 * only when the push succeeds; on any failure the caller keeps the block.
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    int failed = 0;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
    failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    /* fclose flushes, so a failure here also means lost data */
    failed |= fclose(lexWord) != 0;
    if(failed){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    free(RIVout.values);
    return 0;
}
/* fLexPull reads one lexicon entry from an open file: frequency (one int),
 * magnitude (one float), then RIVSIZE ints of values. The frequency slot
 * lives directly after the values in the same allocation.
 *
 * A short read or a failed allocation is signalled by magnitude == -1.
 * Items are counted one per element, so a complete read totals RIVSIZE+2.
 * The result starts zero-initialised, so the flag update below never
 * reads an indeterminate value.
 */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = {0};
    output.values = malloc( (RIVSIZE+1) *sizeof(int));
    if(!output.values){
        output.magnitude = -1;
        return output;
    }
    output.frequency = (unsigned int*)(output.values+RIVSIZE);
    size_t itemsRead = 0;
    itemsRead += fread(output.frequency, sizeof(int), 1, lexWord);
    itemsRead += fread(&(output.magnitude), sizeof(float), 1, lexWord);
    itemsRead += fread(output.values, sizeof(int), RIVSIZE, lexWord);
    if(itemsRead != (size_t)RIVSIZE+2){
        output.magnitude = -1;
    }
    output.cached = 0;
    output.flags &= ~SPARSE;
    return output;
}
/* signalSecure is the signal handler installed by RIVInit. It dumps the
 * cache, reports the outcome, then restores the default disposition and
 * re-sends the signal to this process. `si` and `arg` are unused.
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    const char *report = cacheDump()
        ? "cache dump failed, some lexicon data lost"
        : "cache dumped successfully";
    puts(report);
    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
int cacheDump(){
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+RIVKey.cacheSize;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
flag += fLexPush(*cache_slider);
}
cache_slider++;
}
return flag;
}
/* denseAllocate returns a zero vector: RIVSIZE zeroed ints plus one extra
 * int that holds the frequency, placed immediately after the values for
 * compact memory use. All other members are zeroed, so the flag update
 * below never reads an indeterminate value.
 * If the allocation fails, values and frequency are both NULL.
 */
denseRIV denseAllocate(){
    denseRIV output = {0};
    output.values = calloc(RIVSIZE+1, sizeof(int));
    if(output.values){
        output.frequency = (unsigned int*)(output.values+RIVSIZE);
    }
    output.magnitude = 0;
    output.cached = 0;
    output.flags &= ~SPARSE;
    return output;
}
/*TODO add a simplified free function*/
No preview for this file type
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5
#define CACHESIZE 0
#define THRESHOLD 0.70
#include "RIVtoolsCPUlinux.h"
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* Entry point: builds a sparseRIV for every file under the directory
 * given as argv[1], builds five seed-word centroids, and hands both to
 * getcentroids. Timing for the comparison phase and for the whole run
 * is printed.
 *
 * Arguments are validated before anything is allocated, and the root
 * path is built with a bounded copy. Returns 1 on bad usage or
 * allocation failure, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    char rootString[2000];
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path is too long\n");
        return 1;
    }

    int fileCount = 0;
    RIVInit();
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
    if(!fileRIVs){
        printf("memory allocation failed");
        return 1;
    }
    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);
    getMagnitudes(fileRIVs, fileCount);

    clock_t beginnsquared = clock();
    static const char *seedWords[5] = {"boobs", "ass", "shit", "cocks", "fuck"};
    sparseRIV centroids[5];
    for(int i=0; i<5; i++){
        strcpy(centroids[i].name, seedWords[i]);
        centroids[i] = wordtoL2(centroids[i].name);
    }
    getMagnitudes(centroids, 5);
    getcentroids(centroids, fileRIVs, 5, fileCount);
    clock_t endnsquared = clock();
    double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("nsquared time:%lf\n\n", time);
    printf("%d <", RIVKey.thing);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    free(fileRIVs);
    return 0;
}
/* directoryToL2s walks `rootString` (which must end in '/') recursively
 * and appends one sparseRIV per regular file to the growing array
 * *fileRIVs, incrementing *fileCount for each. Entries whose name starts
 * with '.' are skipped. Each RIV is named after its path, truncated to
 * fit the name field.
 *
 * Subdirectories are recursed into and never opened as files. The
 * directory handle is closed on every exit path. A file that cannot be
 * opened, or a failed reallocation, stops the walk of this directory; the
 * array collected so far stays valid.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* keep the old block reachable if realloc fails */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2Clean(input);
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
/* getcentroids assigns every vector in `vectorSet` to one centroid and
 * accumulates each centroid's members into a dense sum, printing the
 * cosines and the running sums as it goes.
 *
 * For each vector the centroid with the smallest absolute cosine (as
 * returned by cosineCompare) is chosen. NOTE(review): that selects the
 * least similar centroid if cosineCompare returns similarities - confirm
 * the intended metric.
 *
 * Does nothing when either count is not positive. All buffers allocated
 * here are released before returning. NOTE(review): the arrays returned
 * by cosineCompare are not freed because their ownership is not visible
 * from this file.
 */
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount){
    if(centroidCount <= 0 || vectorCount <= 0){
        return;
    }
    float** cosines = malloc(centroidCount*sizeof *cosines);
    /* one flat pool per table, carved into a row per centroid below */
    int* memberPool = calloc((size_t)vectorCount*centroidCount, sizeof(int));
    /* addS2D writes at indices below RIVSIZE, so each row needs RIVSIZE ints */
    int* densePool = calloc((size_t)RIVSIZE*centroidCount, sizeof(int));
    if(!cosines || !memberPool || !densePool){
        printf("memory allocation failed");
        free(cosines);
        free(memberPool);
        free(densePool);
        return;
    }
    for(int i=0; i<centroidCount; i++){
        cosines[i] = cosineCompare(centroids[i], vectorSet, vectorCount);
    }

    int* centroidIndexes[centroidCount];
    int indexCounts[centroidCount];
    int* denses[centroidCount];
    for(int i=0; i<centroidCount; i++){
        centroidIndexes[i] = memberPool+(size_t)i*vectorCount;
        denses[i] = densePool+(size_t)i*RIVSIZE;
        indexCounts[i] = 0;
    }

    float token = 2.0;
    int counter = 0;
    for(int i=0; i<vectorCount; i++){
        token = 2.0;
        printf("\nfor vector %d:\n", i);
        for(int j = 0; j<centroidCount; j++){
            printf("centroid %d: %f", j, cosines[j][i]);
            if(fabsf(cosines[j][i])< token){
                token = fabsf(cosines[j][i]);
                counter = j;
            }
        }
        /* record vector i as a member of the chosen centroid */
        centroidIndexes[counter][indexCounts[counter]] = i;
        indexCounts[counter] += 1;
    }

    for(int i=0; i<centroidCount; i++){
        printf("\n\nnumber %d\n", i);
        for(int j=0; j<indexCounts[i]; j++){
            /* sum the member recorded above, not the j-th vector overall */
            addS2D(denses[i], vectorSet[centroidIndexes[i][j]]);
            for(int k=0; k<RIVSIZE; k++){
                printf("%d, ", denses[i][k]);
            }
        }
    }
    free(densePool);
    free(memberPool);
    free(cosines);
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5
#define CACHESIZE 0
#define THRESHOLD 0.70
#include "RIVtoolsCPUlinux.h"
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* Entry point: builds a sparseRIV for every file under the directory
 * given as argv[1], builds five seed-word centroids, and hands both to
 * getcentroids. Timing for the comparison phase and for the whole run
 * is printed.
 *
 * Arguments are validated before anything is allocated, and the root
 * path is built with a bounded copy. Returns 1 on bad usage or
 * allocation failure, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    char rootString[2000];
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path is too long\n");
        return 1;
    }

    int fileCount = 0;
    RIVInit();
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
    if(!fileRIVs){
        printf("memory allocation failed");
        return 1;
    }
    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);
    getMagnitudes(fileRIVs, fileCount);

    clock_t beginnsquared = clock();
    static const char *seedWords[5] = {"boobs", "ass", "shit", "cocks", "fuck"};
    sparseRIV centroids[5];
    for(int i=0; i<5; i++){
        strcpy(centroids[i].name, seedWords[i]);
        centroids[i] = wordtoL2(centroids[i].name);
    }
    getMagnitudes(centroids, 5);
    getcentroids(centroids, fileRIVs, 5, fileCount);
    clock_t endnsquared = clock();
    double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("nsquared time:%lf\n\n", time);
    printf("%d <", RIVKey.thing);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    free(fileRIVs);
    return 0;
}
/* directoryToL2s walks `rootString` (which must end in '/') recursively
 * and appends one sparseRIV per regular file to the growing array
 * *fileRIVs, incrementing *fileCount for each. Entries whose name starts
 * with '.' are skipped. Each RIV is named after its path, truncated to
 * fit the name field.
 *
 * Subdirectories are recursed into and never opened as files. The
 * directory handle is closed on every exit path. A file that cannot be
 * opened, or a failed reallocation, stops the walk of this directory; the
 * array collected so far stays valid.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* keep the old block reachable if realloc fails */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2Clean(input);
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
/* getcentroids assigns every vector in `vectorSet` to one centroid and
 * accumulates each centroid's members into a dense sum, printing the
 * cosines and the running sums as it goes.
 *
 * For each vector the centroid with the smallest absolute cosine (as
 * returned by cosineCompare) is chosen. NOTE(review): that selects the
 * least similar centroid if cosineCompare returns similarities - confirm
 * the intended metric.
 *
 * Does nothing when either count is not positive. All buffers allocated
 * here are released before returning. NOTE(review): the arrays returned
 * by cosineCompare are not freed because their ownership is not visible
 * from this file.
 */
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount){
    if(centroidCount <= 0 || vectorCount <= 0){
        return;
    }
    float** cosines = malloc(centroidCount*sizeof *cosines);
    /* one flat pool per table, carved into a row per centroid below */
    int* memberPool = calloc((size_t)vectorCount*centroidCount, sizeof(int));
    /* addS2D writes at indices below RIVSIZE, so each row needs RIVSIZE ints */
    int* densePool = calloc((size_t)RIVSIZE*centroidCount, sizeof(int));
    if(!cosines || !memberPool || !densePool){
        printf("memory allocation failed");
        free(cosines);
        free(memberPool);
        free(densePool);
        return;
    }
    for(int i=0; i<centroidCount; i++){
        cosines[i] = cosineCompare(centroids[i], vectorSet, vectorCount);
    }

    int* centroidIndexes[centroidCount];
    int indexCounts[centroidCount];
    int* denses[centroidCount];
    for(int i=0; i<centroidCount; i++){
        centroidIndexes[i] = memberPool+(size_t)i*vectorCount;
        denses[i] = densePool+(size_t)i*RIVSIZE;
        indexCounts[i] = 0;
    }

    float token = 2.0;
    int counter = 0;
    for(int i=0; i<vectorCount; i++){
        token = 2.0;
        printf("\nfor vector %d:\n", i);
        for(int j = 0; j<centroidCount; j++){
            printf("centroid %d: %f", j, cosines[j][i]);
            if(fabsf(cosines[j][i])< token){
                token = fabsf(cosines[j][i]);
                counter = j;
            }
        }
        /* record vector i as a member of the chosen centroid */
        centroidIndexes[counter][indexCounts[counter]] = i;
        indexCounts[counter] += 1;
    }

    for(int i=0; i<centroidCount; i++){
        printf("\n\nnumber %d\n", i);
        for(int j=0; j<indexCounts[i]; j++){
            /* sum the member recorded above, not the j-th vector overall */
            addS2D(denses[i], vectorSet[centroidIndexes[i][j]]);
            for(int k=0; k<RIVSIZE; k++){
                printf("%d, ", denses[i][k]);
            }
        }
    }
    free(densePool);
    free(memberPool);
    free(cosines);
}
No preview for this file type
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.7
#define COSINEACTION do {\
if(cosine > THRESHOLD){ \
printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\
(*multipliers).boolean = 0; \
RIVKey.thing++; \
}\
}while(0)
#include "RIVtoolsMorphic.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
float cosine;
float minmag;
float maxmag;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider);
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0;
RIVKey.thing++;
}
}
comparators_slider++;
//cosineCompare(fileRIVs[i], fileRIVs, i);
}
fileRIVs_slider++;
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
/* directoryToL2s walks `rootString` (which must end in '/') recursively
 * and appends one sparseRIV per regular file to the growing array
 * *fileRIVs, incrementing *fileCount for each. Entries whose name starts
 * with '.' are skipped. Each RIV is named after its path, truncated to
 * fit the name field.
 *
 * Subdirectories are recursed into and never opened as files. The
 * directory handle is closed on every exit path. A file that cannot be
 * opened, or a failed reallocation, stops the walk of this directory; the
 * array collected so far stays valid.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* keep the old block reachable if realloc fails */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2(input);
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.7
#define COSINEACTION do {\
if(cosine > THRESHOLD){ \
printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\
(*multipliers).boolean = 0; \
RIVKey.thing++; \
}\
}while(0)
#include "RIVtoolsMorphic.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
float cosine;
float minmag;
float maxmag;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider);
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0;
RIVKey.thing++;
}
}
comparators_slider++;
//cosineCompare(fileRIVs[i], fileRIVs, i);
}
fileRIVs_slider++;
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
/* directoryToL2s walks `rootString` (which must end in '/') recursively
 * and appends one sparseRIV per regular file to the growing array
 * *fileRIVs, incrementing *fileCount for each. Entries whose name starts
 * with '.' are skipped. Each RIV is named after its path, truncated to
 * fit the name field.
 *
 * Subdirectories are recursed into and never opened as files. The
 * directory handle is closed on every exit path. A file that cannot be
 * opened, or a failed reallocation, stops the walk of this directory; the
 * array collected so far stays valid.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* keep the old block reachable if realloc fails */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2(input);
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
No preview for this file type
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 100000
#include "RIVtoolsCPUlinux.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
/* Entry point: grinds every file under the directory given as argv[1]
 * into the lexicon, prints the total run time, then flushes the cache
 * through RIVCleanup.
 *
 * argv[1] is checked before use and the path is built with a bounded
 * copy. Returns 1 on bad usage, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("directory path is too long\n");
        return 1;
    }
    RIVInit();
    directoryGrind(pathString);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    RIVCleanup();
    return 0;
}
/* addS2Ds adds one sparse RIV into each of the first RIVCount dense RIVs
 * of denseSet: every location/value pair of `additive` is applied, in
 * order, to all targets before moving on to the next pair.
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    for(size_t pair = 0; pair < additive.count; pair++){
        for(int target = 0; target < RIVCount; target++){
            denseSet[target].values[additive.locations[pair]] += additive.values[pair];
        }
    }
}
/* checkDupe reports whether `word` already names one of the first
 * wordCount entries of RIVSet. Returns 1 if found, 0 otherwise.
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int seen = 0; seen < wordCount; seen++){
        if(strcmp(word, RIVSet[seen].name) == 0){
            return 1;
        }
    }
    return 0;
}
/* directoryGrind walks `rootString` (which must end in '/') recursively
 * and runs fileGrind on every file that can be opened for update. Each
 * file path is printed before it is processed.
 *
 * "." and ".." are skipped with a plain `continue`, so the end of the
 * directory is never dereferenced. Subdirectories are recursed into and
 * not opened as files, and the directory handle is closed before
 * returning.
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){
            continue;
        }
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryGrind(pathString);
            continue;
        }
        printf("%s\n", pathString);
        FILE *input = fopen(pathString, "r+");
        if(input){
            fileGrind(input);
            fclose(input);
        }
    }
    closedir(directory);
}
/* fileGrind folds one text file into the lexicon. The file is first
 * reduced to a single aggregate sparseRIV; the file is then re-read word
 * by word, each distinct clean word's lexicon entry is pulled and its
 * frequency incremented, the aggregate is added into every pulled entry,
 * and all entries are pushed back.
 *
 * The read loop runs on fscanf's item count, so the last word is
 * processed even when the file has no trailing whitespace, and a read
 * error ends the loop.
 *
 * NOTE(review): RIVArray is sized by aggregateRIV.frequency; this assumes
 * that value is at least the number of distinct clean words - confirm
 * against fileToL2Clean.
 */
void fileGrind(FILE* textFile){
    sparseRIV aggregateRIV = fileToL2Clean(textFile);
    fseek(textFile, 0, SEEK_SET);
    int wordCount = 0;
    denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
    if(!RIVArray){
        /* nothing to aggregate into (empty file or out of memory) */
        free(aggregateRIV.locations);
        return;
    }
    char word[200];
    while(fscanf(textFile, "%99s", word) == 1){
        if(!isWordClean((char*)word)){
            continue;
        }
        if(checkDupe(RIVArray, word, wordCount)){
            continue;
        }
        RIVArray[wordCount] = lexPull(word);
        /* an entry without a name marks a failed pull */
        if(!*((RIVArray[wordCount].name))) break;
        *(RIVArray[wordCount].frequency) += 1;
        wordCount++;
    }
    addS2Ds(RIVArray, aggregateRIV, wordCount);
    for(int pushed = 0; pushed < wordCount; pushed++){
        lexPush(RIVArray[pushed]);
    }
    free(RIVArray);
    /* values share the locations allocation, so one free releases both */
    free(aggregateRIV.locations);
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 100000
#include "RIVtoolsCPUlinux.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
/* Entry point: grinds every file under the directory given as argv[1]
 * into the lexicon, prints the total run time, then flushes the cache
 * through RIVCleanup.
 *
 * argv[1] is checked before use and the path is built with a bounded
 * copy. Returns 1 on bad usage, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("directory path is too long\n");
        return 1;
    }
    RIVInit();
    directoryGrind(pathString);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    RIVCleanup();
    return 0;
}
/* addS2Ds adds one sparse RIV into each of the first RIVCount dense RIVs
 * of denseSet: every location/value pair of `additive` is applied, in
 * order, to all targets before moving on to the next pair.
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    for(size_t pair = 0; pair < additive.count; pair++){
        for(int target = 0; target < RIVCount; target++){
            denseSet[target].values[additive.locations[pair]] += additive.values[pair];
        }
    }
}
/* checkDupe reports whether `word` already names one of the first
 * wordCount entries of RIVSet. Returns 1 if found, 0 otherwise.
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int seen = 0; seen < wordCount; seen++){
        if(strcmp(word, RIVSet[seen].name) == 0){
            return 1;
        }
    }
    return 0;
}
/* directoryGrind walks `rootString` (which must end in '/') recursively
 * and runs fileGrind on every file that can be opened for update. Each
 * file path is printed before it is processed.
 *
 * "." and ".." are skipped with a plain `continue`, so the end of the
 * directory is never dereferenced. Subdirectories are recursed into and
 * not opened as files, and the directory handle is closed before
 * returning.
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){
            continue;
        }
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryGrind(pathString);
            continue;
        }
        printf("%s\n", pathString);
        FILE *input = fopen(pathString, "r+");
        if(input){
            fileGrind(input);
            fclose(input);
        }
    }
    closedir(directory);
}
/* fileGrind adds a file's aggregate vector to the lexicon entry of every
 * distinct clean word in that file.
 * textFile: open, seekable stream; it is read twice (once by fileToL2Clean,
 * once here after rewinding)
 * each distinct word is pulled with lexPull, has its frequency incremented,
 * receives the aggregate via addS2Ds and is handed back with lexPush
 */
void fileGrind(FILE* textFile){
	sparseRIV aggregateRIV = fileToL2Clean(textFile);
	fseek(textFile, 0, SEEK_SET);
	/* the aggregate's frequency is used as the upper bound on distinct words */
	unsigned int capacity = aggregateRIV.frequency;
	if(capacity == 0){
		free(aggregateRIV.locations);
		return;
	}
	denseRIV *RIVArray = (denseRIV*)malloc(capacity*sizeof(denseRIV));
	if(!RIVArray){
		fprintf(stderr, "fileGrind: out of memory for %u words\n", capacity);
		free(aggregateRIV.locations);
		return;
	}
	int wordCount = 0;
	char word[200];
	/* fscanf returns EOF (non-zero) at end of input, so test for exactly one
	 * conversion; this also keeps a final word that has no trailing whitespace */
	while(fscanf(textFile, "%99s", word) == 1){
		if(!isWordClean((char*)word)){
			continue;
		}
		if(checkDupe(RIVArray, word, wordCount)){
			continue;
		}
		/* never write past the array, whatever count the aggregate reported */
		if((unsigned int)wordCount >= capacity){
			break;
		}
		RIVArray[wordCount] = lexPull(word);
		if(!*(RIVArray[wordCount].name)){
			break;
		}
		*(RIVArray[wordCount].frequency) += 1;
		wordCount++;
	}
	addS2Ds(RIVArray, aggregateRIV, wordCount);
	for(int index = 0; index < wordCount; index++){
		lexPush(RIVArray[index]);
	}
	free(RIVArray);
	free(aggregateRIV.locations);
	/* NOTE(review): aggregateRIV.values is not freed, as before; presumably it
	 * shares the locations allocation - confirm against consolidateI2S */
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexPush stores a denseRIV: in the in-memory cache when its slot is free or
 * held by a less frequent word, otherwise in a file via fLexPush */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
 * and creates a denseRIV with those attributes.
 * if the file does not exist, it creates a 0 vector with the name of word
 */
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads whitespace-delimited words
 * and returns a sparse RIV which is the vector sum of the base RIVs of each
 * word contained
 */
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
 * accepted by isWordClean (per the original note: lowercase letters and
 * the '_' symbol)
 * this is important if you will be lexPush-ing those words later
 */
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct builds the same kind of aggregate by accumulating each word
 * directly into a temporary dense vector */
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" (cosine) between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* wordtoL2 is declared here but not defined in this section */
sparseRIV wordtoL2(char* word);
/* consolidateI2S converts an implicit RIV (a list of locations) into a
 * sparse RIV, choosing between the direct and indirect consolidators */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* text2L2 is like fileToL2 but reads its words from a string */
sparseRIV text2L2(char *text);
/* text2L2 builds the aggregate (L2) sparse RIV of a block of text.
 * text: NUL-terminated string of whitespace-delimited words
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of words read and boolean set to 1
 * words longer than 99 characters are read in 99-character pieces
 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	int displacement = 0;
	/* sscanf returns EOF (non-zero) at the end of the string, so require exactly
	 * one conversion; otherwise displacement would be reused stale */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* advance only over what was consumed: %s skips leading whitespace on
		 * the next call, and stepping one further could pass the terminator */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "text2L2: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 builds the aggregate (L2) sparse RIV of a file.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of words read and boolean set to 1
 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%99s", (char*)word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "fileToL2: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2Clean builds the aggregate (L2) sparse RIV of a file, using only
 * the words accepted by isWordClean.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of clean words read and boolean set to 1
 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%99s", (char*)word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "fileToL2Clean: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of clean words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (a list of locations) into a sparse
 * RIV by dispatching on size: inputs with at most RIVKey.I2SThreshold values
 * go to consolidateI2SDirect, larger ones to consolidateI2SIndirect
 */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	if(valueCount <= RIVKey.I2SThreshold){
		return consolidateI2SDirect(implicit, valueCount);
	}
	return consolidateI2SIndirect(implicit, valueCount);
}
/* aggregateWord2D adds a word's base vector straight into a dense vector.
 * the generator is seeded from the word (wordtoSeed), then NONZEROS times a
 * random position is incremented and a second random position decremented
 * note: this reseeds the global rand() sequence
 */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	int remaining = NONZEROS;
	while(remaining-- > 0){
		int raised = rand()%RIVSIZE;
		destination.values[raised] += 1;
		int lowered = rand()%RIVSIZE;
		destination.values[lowered] -= 1;
	}
}
/* cosCompare returns the cosine between a dense and a sparse RIV.
 * baseRIV: dense vector, indexed by the sparse vector's locations
 * comparator: sparse vector supplying location/value pairs
 * both magnitude fields must already be set; they are not checked for zero
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in 64 bits: products of two ints summed over many
	 * dimensions can exceed the range of int */
	long long dot = 0;
	for(size_t pair = 0; pair < comparator.count; pair++){
		/* only dimensions present in the sparse vector contribute */
		dot += (long long)comparator.values[pair] * baseRIV.values[comparator.locations[pair]];
	}
	/* dot product divided by the product of magnitudes */
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the Euclidean length of a sparse RIV: the
 * square root of the sum of its squared values.
 * input is passed by value, so the caller must store the result itself
 */
float getMagnitudeSparse(sparseRIV input){
	unsigned long long sumOfSquares = 0;
	for(size_t index = 0; index < input.count; index++){
		/* widen before squaring so a large value cannot overflow int */
		long long value = input.values[index];
		sumOfSquares += (unsigned long long)(value*value);
	}
	float magnitude = sqrt(sumOfSquares);
	return magnitude;
}
/* lexPull fetches the denseRIV for a word.
 * lookup order: the cache slot the word hashes to (when CACHESIZE > 0), then
 * the file "lexicon/<word>", and finally a fresh vector from denseAllocate
 * the returned name is word, truncated to fit the name field
 * note: the cache lookup reseeds the global rand() sequence
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%CACHESIZE;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: an over-long word is treated as having no file
	 * instead of overflowing the path buffer */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(pathLength > 0 && (size_t)pathLength < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/*if file does not exist, return a 0 vector */
		output = denseAllocate();
	}
	/* bounded copy into the fixed-size name field */
	snprintf(output.name, sizeof output.name, "%s", word);
	return output;
}
/* lexPush hands a denseRIV back to storage.
 * with no cache (CACHESIZE == 0) the RIV is written with fLexPush.
 * with a cache: a RIV flagged as cached is left alone; otherwise it takes its
 * hash slot if the slot is empty or holds a less frequent word (the evicted
 * entry is written with fLexPush), and is written with fLexPush if not.
 * returns 0 when nothing was written, otherwise the status of fLexPush
 * note: the slot lookup reseeds the global rand() sequence
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	/* no cache: write straight to file and report the outcome */
	return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
	/* if our RIV was cached, there are two options (hopefully)
	 * either the RIV is still cached, and the data has been updated to the cache
	 * or the RIV was pushed out from under it, in which case it has already been pushed*/
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int hash = rand()%CACHESIZE;
	if(!RIVKey.RIVCache[hash].cached){
		/* empty slot: keep the RIV in the cache instead of writing it */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return 0;
	}
	/*if the current RIV is more frequent than the RIV holding its slot */
	if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
		/* push the current cache entry to a file */
		int diag = fLexPush(RIVKey.RIVCache[hash]);
		/* replace the cache entry with the current RIV */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return diag;
	}
	/* push current RIV to file and report the outcome instead of dropping it */
	return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct builds a file's aggregate RIV by accumulating every word
 * directly into a temporary dense vector, then consolidating it.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns the sparseRIV from consolidateD2S with frequency set to the number
 * of words read and boolean set to 1
 * the temp block is cleared for RIVSIZE ints - assumes it is at least that
 * large (TODO confirm against RIVInit)
 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	// a temporary dense RIV is stored in the tempBlock
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;
	// count only successful reads: fscanf returns EOF (non-zero) at end of
	// input, and a final word without trailing whitespace must still be used
	while(fscanf(data, "%99s", (char*)word) == 1){
		count++;
		// add word's L1 RIV to the accumulating dense RIV
		aggregateWord2D(denseTemp, (char*)word);
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	// frequency records the number of words in this file
	output.frequency = count;
	output.boolean = 1;
	return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
/* RIV stands for Random Index Vector, referring to the method of generating
* the basic vectors that correspond to each word. each word has an algorithmically
* generated vector which represents it in this mathematical model, such that a word
* will produce the same vector each time it is encountered*[1]. this base
* vector will be referred to as a L1 vector or a barcode vector
*
* by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
* or aggregate vector. in its simplest implementation, an L2 vector
* representation of a document contains a model of the contents of the
* document, enabling us to compare direction and magnitude of document
* vectors to understand their relationships to each other.
*
* but the system we are really interested in is the ability to form
* context vectors
* a context vector is the sum of all (L1?) vectors that the word
* has been encountered in context with. from these context vectors
* certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
*
* [1] a word produces the same vector each time it is encountered only
* if the environment is the same, i.e. the RIVs have the same dimensionality
* and the same nonzero count. comparing vectors produced in different
* environments yields meaningless drivel and should be avoided
*
* [2] what exactly "context" means remains a major stumbling point.
* paragraphs? sentences? some potential analyses would expect a static
* sized context (the nearest 10 words?) in order to be sensible, but
* it may be that some other definition of context is the most valid for
* this model. we will have to find out.
*
* some notes:
*
* -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
* the two primary data structures we will use to analyze these vectors
* each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
*
* -denseRIV is a standard vector representation.
* each array index corresponds to a dimension
* each value corresponds to a measurement in that dimension
*
* -sparseRIV is vector representation optimized for largely empty vectors
* each data point is a location/value pair where the
* location represents array index
* value represents value in that array index
*
* if we have a sparsely populated dense vector (mostly 0s) such as:
*
* |0|0|5|0|0|0|0|0|4|0|
*
* there are only 2 values in a ten element array. this could, instead
* be represented as
*
* |2|8| array indexes
* |5|4| array values
* |2| record of size
*
* and so, a 10 element vector has been represented in only 5 integers
*
* this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
*
* |0|0|5|0|0|0|0|0|4|0|
* |0|0|0|0|0|0|7|0|3|-2|
* and we wish to perform the dot product this will take 10 steps,
* 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
* |2|8|
* |5|4|
* |2|
*
* |6|8|9|
* |7|3|-2|
* |3|
*
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
* |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
* / / /
* |6|8|9|
* |7|3|-2|
* |3|
* we can simply access the dense vector by indexes held in the sparse vector
* reducing this operation to only 3 steps
*/
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
 * and creates a denseRIV with those attributes.
 * if the file does not exist, it creates a 0 vector with the name of word
 */
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads whitespace-delimited words
 * and returns a sparse RIV which is the vector sum of the base RIVs of each
 * word contained
 */
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
 * accepted by isWordClean (per the original note: lowercase letters and
 * the '_' symbol)
 * this is important if you will be lexPush-ing those words later
 */
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct is an experiment in simplifying the process. it's slow */
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" (cosine) between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* currently unused */
sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal
 * sparse RIV. this chooses the best method to perform the consolidation
 * and launches that function */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* text2L2 builds the aggregate (L2) sparse RIV of a block of text.
 * text: NUL-terminated string of whitespace-delimited words
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of words read and boolean set to 1
 * words longer than 99 characters are read in 99-character pieces
 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	int displacement = 0;
	/* sscanf returns EOF (non-zero) at the end of the string, so require exactly
	 * one conversion; otherwise displacement would be reused stale */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* advance only over what was consumed: %s skips leading whitespace on
		 * the next call, and stepping one further could pass the terminator */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "text2L2: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text, until frequency
	 * is needed to hold some more useful data point */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 builds the aggregate (L2) sparse RIV of a file.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of words read and boolean set to 1
 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%99s", (char*)word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "fileToL2: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2Clean builds the aggregate (L2) sparse RIV of a file, using only
 * the words accepted by isWordClean.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns a sparseRIV produced by consolidateI2S, with frequency set to the
 * number of clean words read and boolean set to 1
 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%99s", (char*)word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			/* double rather than creep by one word at a time */
			unsigned int newSize = blockSize*2;
			int *grown = (int*) realloc(RIVKey.h_tempBlock, newSize*sizeof(int));
			if(!grown){
				/* the old block is still valid; stop reading and keep what we have */
				fprintf(stderr, "fileToL2Clean: out of memory growing temp block\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the real capacity so tempSize never overstates the block */
			RIVKey.tempSize = newSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of clean words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (a list of locations) into a sparse
 * RIV by dispatching on size against RIVKey.I2SThreshold.
 * at or above the threshold the indirect consolidator (optimized for large
 * datasets) is used; below it the direct one, which is faster on small
 * datasets but scales geometrically on large ones
 */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	if(valueCount >= RIVKey.I2SThreshold){
		return consolidateI2SIndirect(implicit, valueCount);
	}
	return consolidateI2SDirect(implicit, valueCount);
}
/* aggregateWord2D adds a word's base vector straight into a dense vector.
 * the generator is seeded from the word (wordtoSeed), then NONZEROS times a
 * random position is incremented and a second random position decremented
 * note: this reseeds the global rand() sequence
 */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	int remaining = NONZEROS;
	while(remaining-- > 0){
		int raised = rand()%RIVSIZE;
		destination.values[raised] += 1;
		int lowered = rand()%RIVSIZE;
		destination.values[lowered] -= 1;
	}
}
/* cosCompare returns the cosine between a dense and a sparse RIV.
 * baseRIV: dense vector, indexed by the sparse vector's locations
 * comparator: sparse vector supplying location/value pairs
 * both magnitude fields must already be set; they are not checked for zero
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in 64 bits: products of two ints summed over many
	 * dimensions can exceed the range of int */
	long long dot = 0;
	for(size_t pair = 0; pair < comparator.count; pair++){
		/* we calculate the dot-product to derive the cosine
		 * comparing sparse to dense by index */
		dot += (long long)comparator.values[pair] * baseRIV.values[comparator.locations[pair]];
	}
	/* dot divided by product of magnitudes */
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the Euclidean length of a sparse RIV: the
 * square root of the sum of its squared values.
 * input is passed by value, so the caller must store the result itself
 */
float getMagnitudeSparse(sparseRIV input){
	unsigned long long sumOfSquares = 0;
	for(size_t index = 0; index < input.count; index++){
		/* widen before squaring so a large value cannot overflow int */
		long long value = input.values[index];
		sumOfSquares += (unsigned long long)(value*value);
	}
	float magnitude = sqrt(sumOfSquares);
	return magnitude;
}
/* lexPull fetches the denseRIV for a word.
 * lookup order: the cache slot the word hashes to (when CACHESIZE > 0), then
 * the file "lexicon/<word>", and finally a fresh vector from denseAllocate
 * the returned name is word, truncated to fit the name field
 * note: the cache lookup reseeds the global rand() sequence
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%CACHESIZE;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	/* if not, attempt to pull the word data from lexicon file */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: an over-long word is treated as having no file
	 * instead of overflowing the path buffer */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(pathLength > 0 && (size_t)pathLength < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector (word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons?
		output = denseAllocate();
	}
	/* bounded copy into the fixed-size name field */
	snprintf(output.name, sizeof output.name, "%s", word);
	return output;
}
/* lexPush hands a denseRIV back to storage.
 * with no cache (CACHESIZE == 0) the RIV is written with fLexPush.
 * with a cache: a RIV flagged as cached is left alone; otherwise it takes its
 * hash slot if the slot is empty or holds a less frequent word (the evicted
 * entry is written with fLexPush), and is written with fLexPush if not.
 * returns 0 when nothing was written, otherwise the status of fLexPush
 * note: the slot lookup reseeds the global rand() sequence
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	/* if there is no cache, simply push to file and report the outcome */
	return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
	/* if our RIV was cached, there are two options (hopefully)
	 * either the RIV is still cached, and the data has been updated
	 * to the cache or the RIV was pushed out from under it,
	 * in which case it has already been pushed! move on*/
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int hash = rand()%CACHESIZE;
	if(!RIVKey.RIVCache[hash].cached){
		/* if there is no word in this cache slot, push to cache instead of file */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return 0;
	}
	/*if the current RIV is more frequent than the RIV holding its slot */
	if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
		/* push the current cache entry to a file */
		int diag = fLexPush(RIVKey.RIVCache[hash]);
		/* push the current RIV to cache */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return diag;
	}
	/* push current RIV to file and report the outcome instead of dropping it */
	return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct builds a file's aggregate RIV by accumulating every word
 * directly into a temporary dense vector, then consolidating it.
 * data: open stream of whitespace-delimited words, read to end of file
 * returns the sparseRIV from consolidateD2S with frequency set to the number
 * of words read and boolean set to 1
 * the temp block is cleared for RIVSIZE ints - assumes it is at least that
 * large (TODO confirm against RIVInit)
 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	// a temporary dense RIV is stored in the tempBlock
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;
	// count only successful reads: fscanf returns EOF (non-zero) at end of
	// input, and a final word without trailing whitespace must still be used
	while(fscanf(data, "%99s", (char*)word) == 1){
		count++;
		// add word's L1 RIV to the accumulating dense RIV
		aggregateWord2D(denseTemp, (char*)word);
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	// frequency records the number of words in this file
	output.frequency = count;
	output.boolean = 1;
	return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
/* RIVData holds the global settings and scratch buffers shared by every
 * function in this file; the single instance is RIVKeyData, filled in by
 * setKeyData */
struct RIVData{
/* dimensionality of every vector */
int RIVsize;
/* number of seeds/locations generated per word (kept even by setKeyData) */
int nonZeros;
/* nonZeros masks derived from SEEDMASK, cycled through by makeSparseLocations */
long long int *masks;
/* scratch block of blockSize ints allocated by setKeyData; reused for seeds,
 * locations and temporary dense vectors */
int *h_tempBlock;
/* the staging fields below are not used by the code in this file */
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
/* NOTE(review): the d_ fields look like device (GPU) pointers; they are not
 * used by the code in this file - confirm */
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
/* counter reset by setKeyData and incremented once per FileToL2 call */
int thing;
}RIVKeyData;
/* sparseRIV stores a vector as parallel arrays of locations and values */
typedef struct{
/* label printed by cosineCompare when a match is reported */
char name[100];
/* values[i] is the value held at dimension locations[i] */
int *values;
int *locations;
/* number of location/value pairs */
int count;
/* not written by any function in this file */
int frequency;
/* Euclidean length, filled in by getMagnitudes */
float magnitude;
/* set to 1 by FileToL2; cleared by cosineCompare once the RIV has matched */
int boolean;
}sparseRIV;
/* builds the aggregate sparse RIV of every word in a file */
sparseRIV FileToL2(FILE *data);
/* copies the non-zero entries of a dense vector into destination */
void consolidateD2S(sparseRIV *destination, int *denseInput);
/* initializes RIVKeyData; must be called before anything else here */
void setKeyData(int RIVsize, int nonZeros, int blockSize);
/* expands a sparse RIV into the dense array destination and returns it */
int* mapS2D(int * destination, sparseRIV input);
/* converts seeds into vector locations, written to the shared temp block */
int* makeSparseLocations(int *seeds, int seedCount);
/* appends the seeds for one word to *seeds and advances *seedCount */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
/* compares baseRIV against each multiplier; returns a malloc'd array of cosines */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
/* fills in the magnitude field of each of the RIVCount inputs */
void getMagnitudes(sparseRIV *inputs, int RIVCount);
/* builds a calloc'd dense vector from locations with alternating +1/-1 values */
int *mapI2D(int *locations, int seedCount);
/* like FileToL2 but reads its words from a string */
sparseRIV text2L2(unsigned char *text);
/* copies the next space-delimited word of *string into word and advances *string */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
/* FileToL2 builds the aggregate sparse RIV of every word in a file.
 * data: open stream of whitespace-delimited words, read to end of file
 * seeds are accumulated in RIVKeyData.h_tempBlock, converted to locations,
 * mapped onto a dense vector and consolidated into the returned sparseRIV
 * the result has boolean set to 1; name, frequency and magnitude are zeroed
 * NOTE(review): nothing here checks the seed count against the size of
 * h_tempBlock - the caller must size blockSize for the largest file
 */
sparseRIV FileToL2(FILE *data){
	sparseRIV output = {0};
	/* stack buffer: nothing to leak, and the field width below bounds every read */
	unsigned char word[2000];
	int *seeds = RIVKeyData.h_tempBlock;
	int seedCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%1999s", (char*)word) == 1){
		makeSeeds(word, &seeds, &seedCount);
	}
	int *locations = makeSparseLocations(seeds, seedCount);
	int *L2dense = mapI2D(locations, seedCount);
	consolidateD2S(&output, L2dense);
	free(L2dense);
	output.boolean = 1;
	RIVKeyData.thing++;
	return output;
}
/* cosineCompare computes the cosine between baseRIV and each multiplier.
 * baseRIV: sparse RIV, expanded into RIVKeyData.h_tempBlock for the comparison
 * multipliers: array of multiplierCount sparse RIVs; only those whose boolean
 * flag is set are compared
 * threshold: a cosine at or above this is printed and clears that
 * multiplier's boolean flag
 * returns a heap array of multiplierCount floats owned by the caller (entries
 * for skipped multipliers are 0), or NULL if the allocation fails
 * magnitudes must already be set; they are not checked for zero
 */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
	int *baseDenseRIV = RIVKeyData.h_tempBlock;
	mapS2D(baseDenseRIV, baseRIV);
	/* zero-initialized so skipped multipliers never yield indeterminate values */
	float *outputs = (float*)calloc(multiplierCount > 0 ? (size_t)multiplierCount : 1, sizeof(float));
	if(!outputs){
		return NULL;
	}
	for(int index = 0; index < multiplierCount; index++){
		sparseRIV *candidate = &multipliers[index];
		/* the magnitude-window filter that used to sit here is disabled */
		if(!candidate->boolean){
			continue;
		}
		/* accumulate in 64 bits so the dot product cannot overflow int */
		long long dot = 0;
		for(int pair = 0; pair < candidate->count; pair++){
			dot += (long long)candidate->values[pair] * baseDenseRIV[candidate->locations[pair]];
		}
		outputs[index] = dot/((baseRIV.magnitude)*(candidate->magnitude));
		if(outputs[index]>=threshold){
			printf("%s\t%s\n%f\n", candidate->name, baseRIV.name, outputs[index]);
			candidate->boolean = 0;
		}
	}
	return outputs;
}
/* getMagnitudes stores the Euclidean length of each sparse RIV in its
 * magnitude field.
 * inputs: array of RIVCount sparse RIVs, updated in place
 */
void getMagnitudes(sparseRIV *inputs, int RIVCount){
	for(int i=0; i<RIVCount; i++){
		/* 64-bit accumulator: a sum of squared ints can exceed the range of int */
		long long sumOfSquares = 0;
		for(int pair = 0; pair < inputs[i].count; pair++){
			long long value = inputs[i].values[pair];
			sumOfSquares += value*value;
		}
		inputs[i].magnitude = sqrt((double)sumOfSquares);
	}
}
/* mapS2D expands a sparse RIV into a dense array.
 * destination: array of at least RIVKeyData.RIVsize ints; it is zeroed and
 * then each sparse value is written at its location
 * returns destination
 */
int* mapS2D(int* destination, sparseRIV input){
	memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
	for(int pair = 0; pair < input.count; pair++){
		destination[input.locations[pair]] = input.values[pair];
	}
	return destination;
}
/* mapI2D turns an implicit RIV (bare locations) into a dense vector.
 * locations: valueCount indices into the vector
 * successive locations receive +1, -1, +1, ... in that order
 * returns a calloc'd array of RIVKeyData.RIVsize ints owned by the caller
 */
int* mapI2D(int *locations, int valueCount){
	int *dense = (int*)calloc(RIVKeyData.RIVsize,sizeof(int));
	int sign = 1;
	for(int index = 0; index < valueCount; index++){
		dense[locations[index]] += sign;
		sign = -sign;
	}
	return dense;
}
/* consolidateD2S copies the non-zero entries of a dense vector into a
 * sparse RIV.
 * destination: receives count plus freshly allocated locations/values arrays
 * owned by the caller; both are NULL when the vector is all zeros or the
 * allocation fails (count is then 0)
 * denseInput: array of RIVKeyData.RIVsize ints
 */
void consolidateD2S(sparseRIV *destination, int *denseInput){
	const int dimension = RIVKeyData.RIVsize;
	/* first pass: count, so the arrays are allocated at their exact size and
	 * a zero-sized realloc is never needed */
	int count = 0;
	for(int i=0; i<dimension; i++){
		if(denseInput[i]){
			count++;
		}
	}
	destination->count = 0;
	destination->locations = NULL;
	destination->values = NULL;
	if(count == 0){
		return;
	}
	int *locations = (int*) malloc(count*sizeof(int));
	int *values = (int*) malloc(count*sizeof(int));
	if(!locations || !values){
		free(locations);
		free(values);
		return;
	}
	/* second pass: record each non-zero dimension and its value */
	int filled = 0;
	for(int i=0; i<dimension; i++){
		if(denseInput[i]){
			locations[filled] = i;
			values[filled] = denseInput[i];
			filled++;
		}
	}
	destination->locations = locations;
	destination->values = values;
	destination->count = count;
}
/* setKeyData initializes the global RIVKeyData.
 * RIVsize: dimensionality of every vector
 * nonZeros: locations generated per word; an odd value is bumped to the next
 * even number and a notice is printed
 * blockSize: number of ints to allocate for the shared temp block
 * also builds the mask table (SEEDMASK shifted right by 5 bits per entry)
 * and resets the "thing" counter
 */
void setKeyData(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2 != 0){
		printf("your nonZeros must be an even number");
		nonZeros += 1;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	long long int *maskTable = (long long int*)malloc(nonZeros*sizeof(long long int));
	RIVKeyData.masks = maskTable;
	int shift = 0;
	for(int slot = 0; slot < nonZeros; slot++){
		maskTable[slot] = SEEDMASK>>shift;
		shift += 5;
	}
	RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
	RIVKeyData.thing = 0;
}
/* makeSeeds appends the seeds for one word to a seed list.
 * word: NUL-terminated word to hash
 * seeds: address of the seed array; RIVKeyData.nonZeros ints are written
 * starting at index *seedCount
 * seedCount: advanced by RIVKeyData.nonZeros
 * NOTE(review): the per-character shift grows by 5 bits and exceeds the
 * width of int for words of 7+ characters - left as is, since changing the
 * hash would change every generated vector
 */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
	int seedbase = 0;
	for(int position = 0; word[position]; position++){
		seedbase += word[position]<<(position*5);
	}
	int *slot = *seeds + *seedCount;
	for(int k = 0; k < RIVKeyData.nonZeros; k++){
		slot[k] = (seedbase>>k)+(3*k);
	}
	*seedCount += RIVKeyData.nonZeros;
}
/* makeSparseLocations converts seeds into vector locations.
 * seeds: seedCount seed values; seed i is XORed with the next mask in
 * rotation, limited to 31 bits and reduced modulo RIVKeyData.RIVsize
 * results are written to RIVKeyData.h_tempBlock, which is returned (it is
 * shared scratch space, not a fresh allocation)
 */
int* makeSparseLocations(int* seeds, int seedCount){
	int *locations = RIVKeyData.h_tempBlock;
	long long int *masks = RIVKeyData.masks;
	int maskIndex = 0;
	for(int index = 0; index < seedCount; index++){
		locations[index] = ((seeds[index]^masks[maskIndex]) & 2147483647) %(RIVKeyData.RIVsize);
		/* rotate through the mask table */
		maskIndex++;
		if(maskIndex >= RIVKeyData.nonZeros){
			maskIndex -= RIVKeyData.nonZeros;
		}
	}
	return locations;
}
/* sscanAdvance copies the next space-delimited word out of a string.
 * string: address of the read cursor; on return it points just past the
 * space that ended the word, or at the terminating NUL
 * word: destination buffer, always NUL-terminated; it must be large enough
 * for the word, no bound is enforced here
 * only ' ' is a delimiter, so consecutive spaces yield empty words
 * returns word
 */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
	unsigned char *cursor = *string;
	unsigned char *out = word;
	for(; *cursor != 0; cursor++){
		if(*cursor == ' '){
			/* consume the delimiter and stop */
			cursor++;
			break;
		}
		*out++ = *cursor;
	}
	*out = 0;
	*string = cursor;
	return word;
}
sparseRIV text2L2(unsigned char *text){
unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int));
unsigned char *text_slider = text;
int seedCount = 0;
while(*text_slider){
sscanAdvance(&text_slider, word);
makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000);
}
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense;
L2dense = mapI2D(locations, seedCount);
free(locations);
sparseRIV output;
consolidateD2S(&output, L2dense);
free(seeds);
return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
/* RIVData holds the global settings and scratch buffers shared by every
 * function in this file; the single instance is RIVKeyData, filled in by
 * setKeyData */
struct RIVData{
/* dimensionality of every vector */
int RIVsize;
/* number of seeds/locations generated per word (kept even by setKeyData) */
int nonZeros;
/* nonZeros masks derived from SEEDMASK, cycled through by makeSparseLocations */
long long int *masks;
/* scratch block of blockSize ints allocated by setKeyData; reused for seeds,
 * locations and temporary dense vectors */
int *h_tempBlock;
/* the staging fields below are not used by the code in this file */
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
/* NOTE(review): the d_ fields look like device (GPU) pointers; they are not
 * used by the code in this file - confirm */
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
/* counter reset by setKeyData and incremented once per FileToL2 call */
int thing;
}RIVKeyData;
/* sparseRIV stores a vector as parallel arrays of locations and values */
typedef struct{
/* label printed by cosineCompare when a match is reported */
char name[100];
/* values[i] is the value held at dimension locations[i] */
int *values;
int *locations;
/* number of location/value pairs */
int count;
/* not written by any function in this file */
int frequency;
/* Euclidean length, filled in by getMagnitudes */
float magnitude;
/* set to 1 by FileToL2; cleared by cosineCompare once the RIV has matched */
int boolean;
}sparseRIV;
/* builds the aggregate sparse RIV of every word in a file */
sparseRIV FileToL2(FILE *data);
/* copies the non-zero entries of a dense vector into destination */
void consolidateD2S(sparseRIV *destination, int *denseInput);
/* initializes RIVKeyData; must be called before anything else here */
void setKeyData(int RIVsize, int nonZeros, int blockSize);
/* expands a sparse RIV into the dense array destination and returns it */
int* mapS2D(int * destination, sparseRIV input);
/* converts seeds into vector locations, written to the shared temp block */
int* makeSparseLocations(int *seeds, int seedCount);
/* appends the seeds for one word to *seeds and advances *seedCount */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
/* compares baseRIV against each multiplier; returns a malloc'd array of cosines */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
/* fills in the magnitude field of each of the RIVCount inputs */
void getMagnitudes(sparseRIV *inputs, int RIVCount);
/* builds a calloc'd dense vector from locations with alternating +1/-1 values */
int *mapI2D(int *locations, int seedCount);
/* like FileToL2 but reads its words from a string */
sparseRIV text2L2(unsigned char *text);
/* copies the next space-delimited word of *string into word and advances *string */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
/* FileToL2 builds the aggregate sparse RIV of every word in a file.
 * data: open stream of whitespace-delimited words, read to end of file
 * seeds are accumulated in RIVKeyData.h_tempBlock, converted to locations,
 * mapped onto a dense vector and consolidated into the returned sparseRIV
 * the result has boolean set to 1; name, frequency and magnitude are zeroed
 * NOTE(review): nothing here checks the seed count against the size of
 * h_tempBlock - the caller must size blockSize for the largest file
 */
sparseRIV FileToL2(FILE *data){
	sparseRIV output = {0};
	/* stack buffer: nothing to leak, and the field width below bounds every read */
	unsigned char word[2000];
	int *seeds = RIVKeyData.h_tempBlock;
	int seedCount = 0;
	/* fscanf returns EOF (non-zero) at end of input, so require exactly one
	 * conversion; this also keeps a final word with no trailing whitespace */
	while(fscanf(data, "%1999s", (char*)word) == 1){
		makeSeeds(word, &seeds, &seedCount);
	}
	int *locations = makeSparseLocations(seeds, seedCount);
	int *L2dense = mapI2D(locations, seedCount);
	consolidateD2S(&output, L2dense);
	free(L2dense);
	output.boolean = 1;
	RIVKeyData.thing++;
	return output;
}
/* cosineCompare computes the cosine between baseRIV and each multiplier.
 * baseRIV: sparse RIV, expanded into RIVKeyData.h_tempBlock for the comparison
 * multipliers: array of multiplierCount sparse RIVs; only those whose boolean
 * flag is set are compared
 * threshold: a cosine at or above this is printed and clears that
 * multiplier's boolean flag
 * returns a heap array of multiplierCount floats owned by the caller (entries
 * for skipped multipliers are 0), or NULL if the allocation fails
 * magnitudes must already be set; they are not checked for zero
 */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
	int *baseDenseRIV = RIVKeyData.h_tempBlock;
	mapS2D(baseDenseRIV, baseRIV);
	/* zero-initialized so skipped multipliers never yield indeterminate values */
	float *outputs = (float*)calloc(multiplierCount > 0 ? (size_t)multiplierCount : 1, sizeof(float));
	if(!outputs){
		return NULL;
	}
	for(int index = 0; index < multiplierCount; index++){
		sparseRIV *candidate = &multipliers[index];
		/* the magnitude-window filter that used to sit here is disabled */
		if(!candidate->boolean){
			continue;
		}
		/* accumulate in 64 bits so the dot product cannot overflow int */
		long long dot = 0;
		for(int pair = 0; pair < candidate->count; pair++){
			dot += (long long)candidate->values[pair] * baseDenseRIV[candidate->locations[pair]];
		}
		outputs[index] = dot/((baseRIV.magnitude)*(candidate->magnitude));
		if(outputs[index]>=threshold){
			printf("%s\t%s\n%f\n", candidate->name, baseRIV.name, outputs[index]);
			candidate->boolean = 0;
		}
	}
	return outputs;
}
/* getMagnitudes stores the Euclidean length of each sparse RIV in its
 * magnitude field.
 * inputs: array of RIVCount sparse RIVs, updated in place
 */
void getMagnitudes(sparseRIV *inputs, int RIVCount){
	for(int i=0; i<RIVCount; i++){
		/* 64-bit accumulator: a sum of squared ints can exceed the range of int */
		long long sumOfSquares = 0;
		for(int pair = 0; pair < inputs[i].count; pair++){
			long long value = inputs[i].values[pair];
			sumOfSquares += value*value;
		}
		inputs[i].magnitude = sqrt((double)sumOfSquares);
	}
}
/* mapS2D expands a sparse RIV into a dense array.
 * destination: array of at least RIVKeyData.RIVsize ints; it is zeroed and
 * then each sparse value is written at its location
 * returns destination
 */
int* mapS2D(int* destination, sparseRIV input){
	memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
	for(int pair = 0; pair < input.count; pair++){
		destination[input.locations[pair]] = input.values[pair];
	}
	return destination;
}
/* mapI2D turns an implicit RIV (bare locations) into a dense vector.
 * locations: valueCount indices into the vector
 * successive locations receive +1, -1, +1, ... in that order
 * returns a calloc'd array of RIVKeyData.RIVsize ints owned by the caller
 */
int* mapI2D(int *locations, int valueCount){
	int *dense = (int*)calloc(RIVKeyData.RIVsize,sizeof(int));
	int sign = 1;
	for(int index = 0; index < valueCount; index++){
		dense[locations[index]] += sign;
		sign = -sign;
	}
	return dense;
}
/* consolidateD2S copies the non-zero entries of a dense vector into a
 * sparse RIV.
 * destination: receives count plus freshly allocated locations/values arrays
 * owned by the caller; both are NULL when the vector is all zeros or the
 * allocation fails (count is then 0)
 * denseInput: array of RIVKeyData.RIVsize ints
 */
void consolidateD2S(sparseRIV *destination, int *denseInput){
	const int dimension = RIVKeyData.RIVsize;
	/* first pass: count, so the arrays are allocated at their exact size and
	 * a zero-sized realloc is never needed */
	int count = 0;
	for(int i=0; i<dimension; i++){
		if(denseInput[i]){
			count++;
		}
	}
	destination->count = 0;
	destination->locations = NULL;
	destination->values = NULL;
	if(count == 0){
		return;
	}
	int *locations = (int*) malloc(count*sizeof(int));
	int *values = (int*) malloc(count*sizeof(int));
	if(!locations || !values){
		free(locations);
		free(values);
		return;
	}
	/* second pass: record each non-zero dimension and its value */
	int filled = 0;
	for(int i=0; i<dimension; i++){
		if(denseInput[i]){
			locations[filled] = i;
			values[filled] = denseInput[i];
			filled++;
		}
	}
	destination->locations = locations;
	destination->values = values;
	destination->count = count;
}
/* setKeyData initialises the global RIVKeyData record.
 * RIVsize:   dimensionality stored in RIVKeyData.RIVsize
 * nonZeros:  number of non-zero entries per word; an odd value is bumped
 *            up by one and a notice is printed
 * blockSize: number of ints allocated for RIVKeyData.h_tempBlock
 * one mask per non-zero is derived from SEEDMASK, each shifted right by a
 * further 5 bits
 */
void setKeyData(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2 != 0){
		printf("your nonZeros must be an even number");
		nonZeros += 1;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
	int maskIndex = 0;
	while(maskIndex < nonZeros){
		RIVKeyData.masks[maskIndex] = SEEDMASK>>(5*maskIndex);
		maskIndex++;
	}
	RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
	RIVKeyData.thing = 0;
}
/* makeSeeds derives RIVKeyData.nonZeros integer seeds from a word and
 * appends them to the seed list.
 * word:      NUL terminated word; each character is shifted left by five
 *            bits per position and summed into a base seed
 * seeds:     address of the seed array; the new seeds are written starting
 *            at index *seedCount. the array is not grown here, so the
 *            caller must guarantee room for RIVKeyData.nonZeros more ints
 * seedCount: in/out, advanced by RIVKeyData.nonZeros
 * NOTE(review): the shift amount reaches 35 at the eighth character, which
 * exceeds the width of a 32-bit int - confirm the intended behaviour for
 * long words before relying on it across compilers
 */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
	int seedbase = 0;
	for(int letter = 0; word[letter]; letter++){
		seedbase += word[letter]<<(letter*5);
	}
	int *slot = (*seeds)+*seedCount;
	for(int k = 0; k<RIVKeyData.nonZeros; k++){
		slot[k] = (seedbase>>k)+(3*k);
	}
	*seedCount += RIVKeyData.nonZeros;
}
/* makeSparseLocations turns seeds into vector locations.
 * each seed is xor-ed with a mask, reduced to a non-negative 31 bit value
 * and taken modulo RIVKeyData.RIVsize; the masks in RIVKeyData.masks are
 * used in rotation, one per seed.
 * seeds:     seedCount input seeds
 * returns RIVKeyData.h_tempBlock, which holds the seedCount locations;
 * that block is shared, so the result is overwritten by later users of it
 */
int* makeSparseLocations(int* seeds, int seedCount){
	int *locations = RIVKeyData.h_tempBlock;
	const long long int *masks = RIVKeyData.masks;
	int maskIndex = 0;
	for(int i=0; i<seedCount; i++){
		locations[i] = ((seeds[i]^masks[maskIndex]) & 2147483647) %(RIVKeyData.RIVsize);
		maskIndex++;
		/* wrap around once every mask has been used */
		if(maskIndex >= RIVKeyData.nonZeros) maskIndex -= RIVKeyData.nonZeros;
	}
	return locations;
}
/* sscanAdvance copies the next space delimited word out of a string.
 * string: in/out cursor; on return it points just past the delimiting
 *         space, or at the terminating NUL if the end was reached
 * word:   output buffer, NUL terminated on return; no length limit is
 *         enforced, so it must be able to hold the longest word plus one
 * returns word
 */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
	unsigned char *cursor = *string;
	unsigned char *out = word;
	for(; *cursor != '\0'; cursor++){
		if(*cursor == ' '){
			/* consume the delimiter itself before stopping */
			cursor++;
			break;
		}
		*out++ = *cursor;
	}
	*out = '\0';
	*string = cursor;
	return word;
}
/* text2L2 builds the sparse RIV of a space delimited text: seeds are made
 * for every word, turned into locations, accumulated into a dense vector
 * and consolidated into sparse form.
 * text: NUL terminated input, not modified
 * returns the consolidated sparse RIV; if an allocation fails a zeroed
 * sparseRIV (count 0, NULL arrays) is returned instead
 */
sparseRIV text2L2(unsigned char *text){
	sparseRIV output = {0};
	/* no word can be longer than the whole text, so a buffer of
	 * strlen(text)+1 bytes cannot be overrun by sscanAdvance */
	unsigned char *word = (unsigned char*)malloc(strlen((char*)text)+1);
	if(!word){
		return output;
	}
	int *seeds = NULL;
	int seedCount = 0;
	unsigned char *text_slider = text;
	while(*text_slider){
		/* makeSeeds appends RIVKeyData.nonZeros seeds without growing the
		 * array, so room for them has to exist before the call */
		int *grown = (int*)realloc(seeds, (seedCount+RIVKeyData.nonZeros)*sizeof(int));
		if(!grown){
			free(seeds);
			free(word);
			return output;
		}
		seeds = grown;
		/* sscanAdvance NUL terminates word, so no clearing is needed */
		sscanAdvance(&text_slider, word);
		makeSeeds(word, &seeds, &seedCount);
	}
	/* the locations live in the shared RIVKeyData.h_tempBlock and must not
	 * be freed here.
	 * NOTE(review): h_tempBlock has to hold seedCount ints - confirm the
	 * block size passed to setKeyData is large enough for the longest text */
	int *locations = makeSparseLocations(seeds, seedCount);
	/* mapI2D hands back a calloc'd dense vector that this function owns */
	int *L2dense = mapI2D(locations, seedCount);
	if(L2dense){
		consolidateD2S(&output, L2dense);
		free(L2dense);
	}
	free(seeds);
	free(word);
	return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/* HandleError is the worker behind the HANDLE_ERROR macro.
 * err:  result of a CUDA runtime call
 * file: source file of the call site
 * line: source line of the call site
 * anything other than cudaSuccess is reported and terminates the program
 * with EXIT_FAILURE
 */
static void HandleError(cudaError_t err, const char *file, int line){
	if(err == cudaSuccess){
		return;
	}
	printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	exit(EXIT_FAILURE);
}
/* squirt replaces each of the first N entries of d_magnitudes with its
 * square root; one thread handles one entry and threads whose index is N
 * or more do nothing
 */
__global__ void squirt(float *d_magnitudes, int N){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx < N){
		d_magnitudes[idx] = sqrt(d_magnitudes[idx]);
	}
}
/* generateLocations converts seeds to vector locations for one "team".
 * a thread handles the seed at index nonZeros*threadNumber + team, i.e.
 * every nonZeros-th seed starting at team, xor-ing it with mask, keeping
 * the low 31 bits and reducing modulo RIVsize.
 * indices at or beyond seedCount are ignored
 */
__global__ void generateLocations(int *d_seeds, long long int mask, int *d_locations, int RIVsize, int team, int seedCount, int nonZeros){
	const unsigned int thread = blockIdx.x*blockDim.x + threadIdx.x;
	const int slot = nonZeros*thread+team;
	if(slot < seedCount){
		d_locations[slot] = ((d_seeds[slot]^mask) & 2147483647) %(RIVsize);
	}
}
/* D2S compacts a dense vector into value/location pairs.
 * each thread inspects one element of d_DenseRIV; a non-zero element
 * claims the next output slot by atomically incrementing *d_NZCount and
 * writes its value and index there. slots are handed out in whatever
 * order the atomic increments happen to complete
 */
__global__ void D2S( int* d_DenseRIV, int* d_SparseValues, int* d_SparseLocations, int *d_NZCount, int d_DenseSize){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx >= d_DenseSize){
		return;
	}
	const int value = d_DenseRIV[idx];
	if(value != 0){
		const int slot = atomicAdd(d_NZCount, 1);
		d_SparseValues[slot] = value;
		d_SparseLocations[slot] = idx;
	}
}
/* S2D scatters sparse pairs into a dense vector: thread i atomically adds
 * d_values[i] to the element of d_OpenSlot selected by d_locations[i].
 * threads at or beyond numberOfValues do nothing
 */
__global__ void S2D(int *d_locations, int *d_values, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx < numberOfValues){
		atomicAdd( &d_OpenSlot[d_locations[idx]] , d_values[idx]);
	}
}
/* I2D scatters an implicit RIV into a dense vector: thread i atomically
 * adds +1 (even i) or -1 (odd i) to the element of d_OpenSlot selected by
 * d_locations[i]. threads at or beyond numberOfValues do nothing
 */
__global__ void I2D(int *d_locations, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx >= numberOfValues){
		return;
	}
	const int value = (idx%2) ? -1: 1;
	atomicAdd( &d_OpenSlot[d_locations[idx]] , value);
}
/* forward declarations of the host-side helpers defined below */
/* gathers the non-zero entries of denseInput into the host staging block */
void consolidateD2SStaged(sparseRIV *destination, int *denseInput);
/* same consolidation, performed on the device with the D2S kernel */
void consolidateD2S_d(sparseRIV *destination, int *denseInput);
/* fills RIVKeyData and allocates the pinned host and device blocks */
void setKeyData_d(int RIVsize, int nonZeros, int blockSize);
/* expands a sparse RIV into destination using the S2D kernel */
int* mapS2D_d(int * destination, sparseRIV input);
/* computes and stores the magnitude of each of the RIVCount inputs */
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount);
/* expands a list of locations into a dense vector using the I2D kernel */
int *mapI2D_d(int *locations, int seedCount);
/* converts seeds to locations using the generateLocations kernel */
int* makeSparseLocations_d(int* seeds, int seedCount);
/* getMagnitudes_d computes the magnitude of every input RIV: the sums of
 * squares are formed on the host, the square roots are taken on the
 * device by the squirt kernel, and the results are stored in each RIV's
 * magnitude field.
 * inputs:   array of RIVCount sparse RIVs
 * RIVCount: number of entries in inputs
 * returns the pinned host array (from cudaMallocHost) holding the
 * RIVCount magnitudes - presumably the caller releases it with
 * cudaFreeHost; verify against callers.
 * the sum of squares is accumulated in 64 bits so it cannot overflow int.
 * NOTE(review): a new device buffer is allocated into
 * RIVKeyData.d_magnitudes on every call and is not freed here
 */
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount){
	float *magnitudes;
	HANDLE_ERROR (cudaMallocHost((float**)&magnitudes,RIVCount*sizeof(float)));
	for(int i=0; i<RIVCount; i++){
		long long sumOfSquares = 0;
		const int *values = inputs[i].values;
		const int *values_stop = values+inputs[i].count;
		for(; values<values_stop; values++){
			/* widen before multiplying so the product cannot overflow */
			sumOfSquares += (long long)(*values)*(*values);
		}
		magnitudes[i] = (float)sumOfSquares;
	}
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_magnitudes, RIVCount*sizeof(float)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_magnitudes, magnitudes, RIVCount*sizeof(float), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, squirt);
	gridSize = ((RIVCount + blockSize -1) / blockSize)+1;
	squirt<<<gridSize,blockSize >>> (RIVKeyData.d_magnitudes, RIVCount);
	HANDLE_ERROR (cudaMemcpy (magnitudes, RIVKeyData.d_magnitudes, RIVCount*sizeof(float), cudaMemcpyDeviceToHost));
	for(int i=0; i<RIVCount; i++){
		inputs[i].magnitude = magnitudes[i];
	}
	return magnitudes;
}
/* mapS2D_d expands a sparse RIV into a dense vector on the device.
 * the device block RIVKeyData.d_OpenSlot is used as
 * [dense vector | locations | values]; the dense part is zeroed, the
 * pairs are uploaded, the S2D kernel scatters them and the dense vector
 * is copied back.
 * destination: host buffer of RIVKeyData.RIVsize ints
 * returns destination
 */
int *mapS2D_d(int* destination, sparseRIV input){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;
	int *d_values = d_locations+input.count;
	const size_t denseBytes = RIVKeyData.RIVsize*sizeof(int);
	const size_t pairBytes = input.count*sizeof(int);
	HANDLE_ERROR (cudaMemset (d_dense, 0, denseBytes));
	HANDLE_ERROR (cudaMemcpy (d_locations, input.locations, pairBytes, cudaMemcpyHostToDevice));
	HANDLE_ERROR (cudaMemcpy (d_values, input.values, pairBytes, cudaMemcpyHostToDevice));
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, S2D);
	const int gridSize = ((input.count + blockSize -1) / blockSize)+1;
	S2D <<<gridSize,blockSize>>> (d_locations, d_values, d_dense, input.count);
	HANDLE_ERROR (cudaMemcpy (destination, d_dense, denseBytes, cudaMemcpyDeviceToHost));
	return destination;
}
/* mapI2D_d expands an implicit RIV (a list of locations) into a dense
 * vector on the device using the I2D kernel.
 * locations:  valueCount host-side locations
 * valueCount: number of entries in locations
 * returns RIVKeyData.h_tempBlock, which receives the RIVKeyData.RIVsize
 * dense values; that block is shared and is overwritten by later users
 */
int* mapI2D_d(int *locations, int valueCount){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;
	const size_t denseBytes = RIVKeyData.RIVsize*sizeof(int);
	HANDLE_ERROR (cudaMemset (d_dense, 0, denseBytes));
	HANDLE_ERROR (cudaMemcpy (d_locations, locations, valueCount*sizeof(int), cudaMemcpyHostToDevice));
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, I2D);
	const int gridSize = ((valueCount + blockSize -1) / blockSize)+1;
	I2D <<<gridSize,blockSize>>> (d_locations, d_dense, valueCount);
	int *valuesOut = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy (valuesOut, d_dense, denseBytes, cudaMemcpyDeviceToHost));
	return valuesOut;
}
/* consolidateD2SStaged gathers the non-zero entries of a dense vector and
 * stores them in the host staging block, which is filled downwards from
 * RIVKeyData.h_staging_slider. the record written is, from low to high
 * address: [count][values...][locations...]; its offset from the start of
 * the staging block is appended to the RIVKeyData.h_displacements table.
 * destination: receives pointers to the staged locations and values and
 *              the number of pairs
 * denseInput:  RIVKeyData.RIVsize ints
 * RIVKeyData.h_tempBlock is used as scratch space and must hold
 * 2*RIVKeyData.RIVsize ints.
 * exactly count entries are copied per array; the earlier loop bounds
 * copied one extra element read from in front of each scratch array.
 * NOTE(review): nothing here checks that the staging block still has room
 */
void consolidateD2SStaged(sparseRIV *destination, int *denseInput){
	int count = 0;
	int *locations = RIVKeyData.h_tempBlock;
	int *values = RIVKeyData.h_tempBlock + RIVKeyData.RIVsize;
	for(int i=0; i<RIVKeyData.RIVsize; i++){
		if(denseInput[i]){
			locations[count] = i;
			values[count] = denseInput[i];
			count++;
		}
	}
	/* copy back to front so the staged arrays keep ascending order */
	for(int i=count-1; i>=0; i--){
		RIVKeyData.h_staging_slider--;
		*RIVKeyData.h_staging_slider = locations[i];
	}
	(*destination).locations = RIVKeyData.h_staging_slider;
	for(int i=count-1; i>=0; i--){
		RIVKeyData.h_staging_slider--;
		*RIVKeyData.h_staging_slider = values[i];
	}
	(*destination).values = RIVKeyData.h_staging_slider;
	(*destination).count = count;
	/* the pair count is stored directly in front of the values */
	RIVKeyData.h_staging_slider--;
	*RIVKeyData.h_staging_slider = count;
	*RIVKeyData.h_displacements = RIVKeyData.h_staging_slider -RIVKeyData.h_stagingBlock;
	RIVKeyData.h_displacements++;
}
/* consolidateD2S_d compacts a dense vector into sparse form on the device
 * with the D2S kernel and copies the pairs into the host staging block.
 * the device block RIVKeyData.d_OpenSlot is used as
 * [dense input | output values | output locations], RIVKeyData.RIVsize
 * ints each.
 * destination: receives count and pointers into the staging block
 * denseInput:  RIVKeyData.RIVsize ints on the host
 * the pair count is fetched into a local int before being stored, so a
 * count field wider than int is fully initialised.
 * NOTE(review): the staging slider is advanced upwards here, while
 * setKeyData_d starts it at the end of the staging block - confirm which
 * direction is intended before using both together
 */
void consolidateD2S_d(sparseRIV *destination, int *denseInput){
	int *d_valueCount;
	HANDLE_ERROR (cudaMalloc((void**)&d_valueCount, sizeof(int)));
	HANDLE_ERROR(cudaMemset(d_valueCount, 0, sizeof(int)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_OpenSlot, denseInput, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyHostToDevice));
	int *d_outValues = RIVKeyData.d_OpenSlot+RIVKeyData.RIVsize;
	int *d_outLocations = d_outValues+RIVKeyData.RIVsize;
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, D2S);
	gridSize = ((RIVKeyData.RIVsize + blockSize -1) / blockSize)+1;
	D2S <<<gridSize,blockSize>>> (RIVKeyData.d_OpenSlot, d_outValues, d_outLocations, d_valueCount, RIVKeyData.RIVsize);
	HANDLE_ERROR (cudaDeviceSynchronize());
	int valueCount = 0;
	HANDLE_ERROR (cudaMemcpy (&valueCount, d_valueCount, sizeof(int), cudaMemcpyDeviceToHost));
	(*destination).count = valueCount;
	(*destination).locations = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=valueCount;
	(*destination).values = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=valueCount;
	HANDLE_ERROR (cudaMemcpy ((*destination).values, d_outValues, valueCount*sizeof(int), cudaMemcpyDeviceToHost));
	HANDLE_ERROR (cudaMemcpy ((*destination).locations, d_outLocations, valueCount*sizeof(int), cudaMemcpyDeviceToHost));
	cudaFree(d_valueCount);
}
/* setKeyData_d initialises the global RIVKeyData record for the device
 * code path.
 * RIVsize:   dimensionality stored in RIVKeyData.RIVsize
 * nonZeros:  non-zero entries per word; an odd value is bumped up by one
 *            and a notice is printed
 * blockSize: number of ints in each of the pinned host temp block, the
 *            pinned host staging block and the device block
 * the staging slider starts at the end of the staging block and the
 * displacement table at its beginning
 */
void setKeyData_d(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2 != 0){
		printf("your nonZeros must be an even number");
		nonZeros += 1;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
	int maskIndex = 0;
	while(maskIndex < nonZeros){
		RIVKeyData.masks[maskIndex] = SEEDMASK>>(5*maskIndex);
		maskIndex++;
	}
	const size_t blockBytes = blockSize*sizeof(int);
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_tempBlock, blockBytes));
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_stagingBlock, blockBytes));
	RIVKeyData.h_staging_stop = RIVKeyData.h_stagingBlock + blockSize;
	RIVKeyData.h_staging_slider = RIVKeyData.h_staging_stop;
	RIVKeyData.h_displacements = RIVKeyData.h_stagingBlock;
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_OpenSlot, blockBytes));
	RIVKeyData.d_SlotEnd = RIVKeyData.d_OpenSlot+blockSize;
	RIVKeyData.thing = 0;
}
/* makeSparseLocations_d converts seeds to locations on the device.
 * the device block RIVKeyData.d_OpenSlot is used as [locations | seeds],
 * seedCount ints each. the generateLocations kernel is launched once per
 * "team" (one per mask), each launch handling every nonZeros-th seed.
 * seeds:     seedCount host-side seeds
 * returns RIVKeyData.h_tempBlock, which receives the seedCount locations
 * the launches no longer pass the team number as the third launch
 * parameter: that slot is the dynamic shared memory size, which the
 * kernel does not use
 */
int* makeSparseLocations_d(int* seeds, int seedCount){
	int *d_locations = RIVKeyData.d_OpenSlot;
	int *d_seeds = d_locations+seedCount;
	HANDLE_ERROR (cudaMemcpy(d_seeds, seeds, seedCount*sizeof(int), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, generateLocations);
	gridSize = ((seedCount + blockSize -1) / (RIVKeyData.nonZeros*blockSize))+1;
	for(int team=0; team<RIVKeyData.nonZeros; team++){
		generateLocations <<<gridSize,blockSize>>> (d_seeds, RIVKeyData.masks[team], d_locations, RIVKeyData.RIVsize, team, seedCount, RIVKeyData.nonZeros);
	}
	/* surface any fault raised while the kernels were running */
	HANDLE_ERROR (cudaDeviceSynchronize());
	int *locations = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy(locations, d_locations, seedCount*sizeof(int), cudaMemcpyDeviceToHost));
	return locations;
}
/* addS2DsBlocked runs the S2Ds kernel over a block of RIVCount dense
 * vectors held at the start of RIVKeyData.d_OpenSlot, passing it the
 * pairs of one sparse RIV, and copies the whole block back to the host.
 * denseBlock: host buffer receiving RIVCount*RIVKeyData.RIVsize ints
 * additive:   sparse RIV whose locations/values are uploaded
 * RIVCount:   number of dense vectors in the device block
 * NOTE(review): the S2Ds kernel is not defined in this part of the file,
 * so what it does with the pairs cannot be confirmed from here
 */
void addS2DsBlocked(int *denseBlock, sparseRIV additive, int RIVCount){
/* the pairs are staged on the device directly behind the dense block */
int *d_locations= RIVKeyData.d_OpenSlot+RIVCount*RIVKeyData.RIVsize;
int *d_values = d_locations+additive.count;
HANDLE_ERROR (cudaMemcpy (d_locations, additive.locations, additive.count*sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMemcpy (d_values, additive.values, additive.count*sizeof(int), cudaMemcpyHostToDevice));
int blockSize;
int minGridSize = 0;
int gridSize;
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, S2Ds);
/* NOTE(review): gridSize and blockSize are computed but the launch below
 * does not use them - it starts additive.count blocks of one thread */
gridSize = ((additive.count + blockSize -1) / blockSize)+1;
S2Ds<<<additive.count,1>>>(RIVKeyData.d_OpenSlot, d_locations, d_values, additive.count, RIVCount, RIVKeyData.RIVsize);
HANDLE_ERROR (cudaMemcpy (denseBlock, RIVKeyData.d_OpenSlot, RIVCount*RIVKeyData.RIVsize*sizeof(int), cudaMemcpyDeviceToHost));
}
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/* HandleError is the worker behind the HANDLE_ERROR macro.
 * err:  result of a CUDA runtime call
 * file: source file of the call site
 * line: source line of the call site
 * anything other than cudaSuccess is reported and terminates the program
 * with EXIT_FAILURE
 */
static void HandleError(cudaError_t err, const char *file, int line){
	if(err == cudaSuccess){
		return;
	}
	printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	exit(EXIT_FAILURE);
}
/* squirt replaces each of the first N entries of d_magnitudes with its
 * square root; one thread handles one entry and threads whose index is N
 * or more do nothing
 */
__global__ void squirt(float *d_magnitudes, int N){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx < N){
		d_magnitudes[idx] = sqrt(d_magnitudes[idx]);
	}
}
/* generateLocations converts seeds to vector locations for one "team".
 * a thread handles the seed at index nonZeros*threadNumber + team, i.e.
 * every nonZeros-th seed starting at team, xor-ing it with mask, keeping
 * the low 31 bits and reducing modulo RIVsize.
 * indices at or beyond seedCount are ignored
 */
__global__ void generateLocations(int *d_seeds, long long int mask, int *d_locations, int RIVsize, int team, int seedCount, int nonZeros){
	const unsigned int thread = blockIdx.x*blockDim.x + threadIdx.x;
	const int slot = nonZeros*thread+team;
	if(slot < seedCount){
		d_locations[slot] = ((d_seeds[slot]^mask) & 2147483647) %(RIVsize);
	}
}
/* D2S compacts a dense vector into value/location pairs.
 * each thread inspects one element of d_DenseRIV; a non-zero element
 * claims the next output slot by atomically incrementing *d_NZCount and
 * writes its value and index there. slots are handed out in whatever
 * order the atomic increments happen to complete
 */
__global__ void D2S( int* d_DenseRIV, int* d_SparseValues, int* d_SparseLocations, int *d_NZCount, int d_DenseSize){
	const int idx = (blockIdx.x*blockDim.x + threadIdx.x);
	if(idx >= d_DenseSize){
		return;
	}
	const int value = d_DenseRIV[idx];
	if(value != 0){
		const int slot = atomicAdd(d_NZCount, 1);
		d_SparseValues[slot] = value;
		d_SparseLocations[slot] = idx;
	}
}
/* S2D scatters sparse pairs into a dense vector: thread i atomically adds
 * d_values[i] to the element of d_OpenSlot selected by d_locations[i].
 * threads at or beyond numberOfValues do nothing
 */
__global__ void S2D(int *d_locations, int *d_values, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx < numberOfValues){
		atomicAdd( &d_OpenSlot[d_locations[idx]] , d_values[idx]);
	}
}
/* I2D scatters an implicit RIV into a dense vector: thread i atomically
 * adds +1 (even i) or -1 (odd i) to the element of d_OpenSlot selected by
 * d_locations[i]. threads at or beyond numberOfValues do nothing
 */
__global__ void I2D(int *d_locations, int *d_OpenSlot, int numberOfValues){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx >= numberOfValues){
		return;
	}
	const int value = (idx%2) ? -1: 1;
	atomicAdd( &d_OpenSlot[d_locations[idx]] , value);
}
/* forward declarations of the host-side helpers defined below */
/* gathers the non-zero entries of denseInput into the host staging block */
void consolidateD2SStaged(sparseRIV *destination, int *denseInput);
/* same consolidation, performed on the device with the D2S kernel */
void consolidateD2S_d(sparseRIV *destination, int *denseInput);
/* fills RIVKeyData and allocates the pinned host and device blocks */
void setKeyData_d(int RIVsize, int nonZeros, int blockSize);
/* expands a sparse RIV into destination using the S2D kernel */
int* mapS2D_d(int * destination, sparseRIV input);
/* computes and stores the magnitude of each of the RIVCount inputs */
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount);
/* expands a list of locations into a dense vector using the I2D kernel */
int *mapI2D_d(int *locations, int seedCount);
/* converts seeds to locations using the generateLocations kernel */
int* makeSparseLocations_d(int* seeds, int seedCount);
/* getMagnitudes_d computes the magnitude of every input RIV: the sums of
 * squares are formed on the host, the square roots are taken on the
 * device by the squirt kernel, and the results are stored in each RIV's
 * magnitude field.
 * inputs:   array of RIVCount sparse RIVs
 * RIVCount: number of entries in inputs
 * returns the pinned host array (from cudaMallocHost) holding the
 * RIVCount magnitudes - presumably the caller releases it with
 * cudaFreeHost; verify against callers.
 * the sum of squares is accumulated in 64 bits so it cannot overflow int.
 * NOTE(review): a new device buffer is allocated into
 * RIVKeyData.d_magnitudes on every call and is not freed here
 */
float *getMagnitudes_d(sparseRIV *inputs, int RIVCount){
	float *magnitudes;
	HANDLE_ERROR (cudaMallocHost((float**)&magnitudes,RIVCount*sizeof(float)));
	for(int i=0; i<RIVCount; i++){
		long long sumOfSquares = 0;
		const int *values = inputs[i].values;
		const int *values_stop = values+inputs[i].count;
		for(; values<values_stop; values++){
			/* widen before multiplying so the product cannot overflow */
			sumOfSquares += (long long)(*values)*(*values);
		}
		magnitudes[i] = (float)sumOfSquares;
	}
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_magnitudes, RIVCount*sizeof(float)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_magnitudes, magnitudes, RIVCount*sizeof(float), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, squirt);
	gridSize = ((RIVCount + blockSize -1) / blockSize)+1;
	squirt<<<gridSize,blockSize >>> (RIVKeyData.d_magnitudes, RIVCount);
	HANDLE_ERROR (cudaMemcpy (magnitudes, RIVKeyData.d_magnitudes, RIVCount*sizeof(float), cudaMemcpyDeviceToHost));
	for(int i=0; i<RIVCount; i++){
		inputs[i].magnitude = magnitudes[i];
	}
	return magnitudes;
}
/* mapS2D_d expands a sparse RIV into a dense vector on the device.
 * the device block RIVKeyData.d_OpenSlot is used as
 * [dense vector | locations | values]; the dense part is zeroed, the
 * pairs are uploaded, the S2D kernel scatters them and the dense vector
 * is copied back.
 * destination: host buffer of RIVKeyData.RIVsize ints
 * returns destination
 */
int *mapS2D_d(int* destination, sparseRIV input){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;
	int *d_values = d_locations+input.count;
	const size_t denseBytes = RIVKeyData.RIVsize*sizeof(int);
	const size_t pairBytes = input.count*sizeof(int);
	HANDLE_ERROR (cudaMemset (d_dense, 0, denseBytes));
	HANDLE_ERROR (cudaMemcpy (d_locations, input.locations, pairBytes, cudaMemcpyHostToDevice));
	HANDLE_ERROR (cudaMemcpy (d_values, input.values, pairBytes, cudaMemcpyHostToDevice));
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, S2D);
	const int gridSize = ((input.count + blockSize -1) / blockSize)+1;
	S2D <<<gridSize,blockSize>>> (d_locations, d_values, d_dense, input.count);
	HANDLE_ERROR (cudaMemcpy (destination, d_dense, denseBytes, cudaMemcpyDeviceToHost));
	return destination;
}
/* mapI2D_d expands an implicit RIV (a list of locations) into a dense
 * vector on the device using the I2D kernel.
 * locations:  valueCount host-side locations
 * valueCount: number of entries in locations
 * returns RIVKeyData.h_tempBlock, which receives the RIVKeyData.RIVsize
 * dense values; that block is shared and is overwritten by later users
 */
int* mapI2D_d(int *locations, int valueCount){
	int *d_dense = RIVKeyData.d_OpenSlot;
	int *d_locations = d_dense+RIVKeyData.RIVsize;
	const size_t denseBytes = RIVKeyData.RIVsize*sizeof(int);
	HANDLE_ERROR (cudaMemset (d_dense, 0, denseBytes));
	HANDLE_ERROR (cudaMemcpy (d_locations, locations, valueCount*sizeof(int), cudaMemcpyHostToDevice));
	int minGridSize = 0;
	int blockSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, I2D);
	const int gridSize = ((valueCount + blockSize -1) / blockSize)+1;
	I2D <<<gridSize,blockSize>>> (d_locations, d_dense, valueCount);
	int *valuesOut = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy (valuesOut, d_dense, denseBytes, cudaMemcpyDeviceToHost));
	return valuesOut;
}
/* consolidateD2SStaged gathers the non-zero entries of a dense vector and
 * stores them in the host staging block, which is filled downwards from
 * RIVKeyData.h_staging_slider. the record written is, from low to high
 * address: [count][values...][locations...]; its offset from the start of
 * the staging block is appended to the RIVKeyData.h_displacements table.
 * destination: receives pointers to the staged locations and values and
 *              the number of pairs
 * denseInput:  RIVKeyData.RIVsize ints
 * RIVKeyData.h_tempBlock is used as scratch space and must hold
 * 2*RIVKeyData.RIVsize ints.
 * exactly count entries are copied per array; the earlier loop bounds
 * copied one extra element read from in front of each scratch array.
 * NOTE(review): nothing here checks that the staging block still has room
 */
void consolidateD2SStaged(sparseRIV *destination, int *denseInput){
	int count = 0;
	int *locations = RIVKeyData.h_tempBlock;
	int *values = RIVKeyData.h_tempBlock + RIVKeyData.RIVsize;
	for(int i=0; i<RIVKeyData.RIVsize; i++){
		if(denseInput[i]){
			locations[count] = i;
			values[count] = denseInput[i];
			count++;
		}
	}
	/* copy back to front so the staged arrays keep ascending order */
	for(int i=count-1; i>=0; i--){
		RIVKeyData.h_staging_slider--;
		*RIVKeyData.h_staging_slider = locations[i];
	}
	(*destination).locations = RIVKeyData.h_staging_slider;
	for(int i=count-1; i>=0; i--){
		RIVKeyData.h_staging_slider--;
		*RIVKeyData.h_staging_slider = values[i];
	}
	(*destination).values = RIVKeyData.h_staging_slider;
	(*destination).count = count;
	/* the pair count is stored directly in front of the values */
	RIVKeyData.h_staging_slider--;
	*RIVKeyData.h_staging_slider = count;
	*RIVKeyData.h_displacements = RIVKeyData.h_staging_slider -RIVKeyData.h_stagingBlock;
	RIVKeyData.h_displacements++;
}
/* consolidateD2S_d compacts a dense vector into sparse form on the device
 * with the D2S kernel and copies the pairs into the host staging block.
 * the device block RIVKeyData.d_OpenSlot is used as
 * [dense input | output values | output locations], RIVKeyData.RIVsize
 * ints each.
 * destination: receives count and pointers into the staging block
 * denseInput:  RIVKeyData.RIVsize ints on the host
 * the pair count is fetched into a local int before being stored, so a
 * count field wider than int is fully initialised.
 * NOTE(review): the staging slider is advanced upwards here, while
 * setKeyData_d starts it at the end of the staging block - confirm which
 * direction is intended before using both together
 */
void consolidateD2S_d(sparseRIV *destination, int *denseInput){
	int *d_valueCount;
	HANDLE_ERROR (cudaMalloc((void**)&d_valueCount, sizeof(int)));
	HANDLE_ERROR(cudaMemset(d_valueCount, 0, sizeof(int)));
	HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_OpenSlot, denseInput, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyHostToDevice));
	int *d_outValues = RIVKeyData.d_OpenSlot+RIVKeyData.RIVsize;
	int *d_outLocations = d_outValues+RIVKeyData.RIVsize;
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, D2S);
	gridSize = ((RIVKeyData.RIVsize + blockSize -1) / blockSize)+1;
	D2S <<<gridSize,blockSize>>> (RIVKeyData.d_OpenSlot, d_outValues, d_outLocations, d_valueCount, RIVKeyData.RIVsize);
	HANDLE_ERROR (cudaDeviceSynchronize());
	int valueCount = 0;
	HANDLE_ERROR (cudaMemcpy (&valueCount, d_valueCount, sizeof(int), cudaMemcpyDeviceToHost));
	(*destination).count = valueCount;
	(*destination).locations = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=valueCount;
	(*destination).values = RIVKeyData.h_staging_slider;
	RIVKeyData.h_staging_slider+=valueCount;
	HANDLE_ERROR (cudaMemcpy ((*destination).values, d_outValues, valueCount*sizeof(int), cudaMemcpyDeviceToHost));
	HANDLE_ERROR (cudaMemcpy ((*destination).locations, d_outLocations, valueCount*sizeof(int), cudaMemcpyDeviceToHost));
	cudaFree(d_valueCount);
}
/* setKeyData_d initialises the global RIVKeyData record for the device
 * code path.
 * RIVsize:   dimensionality stored in RIVKeyData.RIVsize
 * nonZeros:  non-zero entries per word; an odd value is bumped up by one
 *            and a notice is printed
 * blockSize: number of ints in each of the pinned host temp block, the
 *            pinned host staging block and the device block
 * the staging slider starts at the end of the staging block and the
 * displacement table at its beginning
 */
void setKeyData_d(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	if(nonZeros%2 != 0){
		printf("your nonZeros must be an even number");
		nonZeros += 1;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
	int maskIndex = 0;
	while(maskIndex < nonZeros){
		RIVKeyData.masks[maskIndex] = SEEDMASK>>(5*maskIndex);
		maskIndex++;
	}
	const size_t blockBytes = blockSize*sizeof(int);
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_tempBlock, blockBytes));
	HANDLE_ERROR (cudaMallocHost((void**)&RIVKeyData.h_stagingBlock, blockBytes));
	RIVKeyData.h_staging_stop = RIVKeyData.h_stagingBlock + blockSize;
	RIVKeyData.h_staging_slider = RIVKeyData.h_staging_stop;
	RIVKeyData.h_displacements = RIVKeyData.h_stagingBlock;
	HANDLE_ERROR (cudaMalloc((void**)&RIVKeyData.d_OpenSlot, blockBytes));
	RIVKeyData.d_SlotEnd = RIVKeyData.d_OpenSlot+blockSize;
	RIVKeyData.thing = 0;
}
/* makeSparseLocations_d converts seeds to locations on the device.
 * the device block RIVKeyData.d_OpenSlot is used as [locations | seeds],
 * seedCount ints each. the generateLocations kernel is launched once per
 * "team" (one per mask), each launch handling every nonZeros-th seed.
 * seeds:     seedCount host-side seeds
 * returns RIVKeyData.h_tempBlock, which receives the seedCount locations
 * the launches no longer pass the team number as the third launch
 * parameter: that slot is the dynamic shared memory size, which the
 * kernel does not use
 */
int* makeSparseLocations_d(int* seeds, int seedCount){
	int *d_locations = RIVKeyData.d_OpenSlot;
	int *d_seeds = d_locations+seedCount;
	HANDLE_ERROR (cudaMemcpy(d_seeds, seeds, seedCount*sizeof(int), cudaMemcpyHostToDevice));
	int blockSize;
	int minGridSize = 0;
	int gridSize;
	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, generateLocations);
	gridSize = ((seedCount + blockSize -1) / (RIVKeyData.nonZeros*blockSize))+1;
	for(int team=0; team<RIVKeyData.nonZeros; team++){
		generateLocations <<<gridSize,blockSize>>> (d_seeds, RIVKeyData.masks[team], d_locations, RIVKeyData.RIVsize, team, seedCount, RIVKeyData.nonZeros);
	}
	/* surface any fault raised while the kernels were running */
	HANDLE_ERROR (cudaDeviceSynchronize());
	int *locations = RIVKeyData.h_tempBlock;
	HANDLE_ERROR (cudaMemcpy(locations, d_locations, seedCount*sizeof(int), cudaMemcpyDeviceToHost));
	return locations;
}
/* addS2DsBlocked runs the S2Ds kernel over a block of RIVCount dense
 * vectors held at the start of RIVKeyData.d_OpenSlot, passing it the
 * pairs of one sparse RIV, and copies the whole block back to the host.
 * denseBlock: host buffer receiving RIVCount*RIVKeyData.RIVsize ints
 * additive:   sparse RIV whose locations/values are uploaded
 * RIVCount:   number of dense vectors in the device block
 * NOTE(review): the S2Ds kernel is not defined in this part of the file,
 * so what it does with the pairs cannot be confirmed from here
 */
void addS2DsBlocked(int *denseBlock, sparseRIV additive, int RIVCount){
/* the pairs are staged on the device directly behind the dense block */
int *d_locations= RIVKeyData.d_OpenSlot+RIVCount*RIVKeyData.RIVsize;
int *d_values = d_locations+additive.count;
HANDLE_ERROR (cudaMemcpy (d_locations, additive.locations, additive.count*sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMemcpy (d_values, additive.values, additive.count*sizeof(int), cudaMemcpyHostToDevice));
int blockSize;
int minGridSize = 0;
int gridSize;
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, S2Ds);
/* NOTE(review): gridSize and blockSize are computed but the launch below
 * does not use them - it starts additive.count blocks of one thread */
gridSize = ((additive.count + blockSize -1) / blockSize)+1;
S2Ds<<<additive.count,1>>>(RIVKeyData.d_OpenSlot, d_locations, d_values, additive.count, RIVCount, RIVKeyData.RIVsize);
HANDLE_ERROR (cudaMemcpy (denseBlock, RIVKeyData.d_OpenSlot, RIVCount*RIVKeyData.RIVsize*sizeof(int), cudaMemcpyDeviceToHost));
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLowerMorphic.h"
#include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
*/
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
* that pass isWordClean (lowercase letters and the '_' symbol)
* this is important if you will be lexPush-ing those words later
*/
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct reads words from a file and accumulates each word
* straight into a temporary dense vector before consolidating it into
* a sparse RIV
*/
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" (cosine) between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* NOTE(review): wordtoL2 is declared here but not defined in this part
* of the file - presumably it builds the RIV of a single word; confirm
*/
sparseRIV wordtoL2(char* word);
/* consolidateI2S turns an implicit RIV (a list of valueCount locations)
* into a sparse RIV, picking the direct or indirect method by size
*/
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* text2L2 does for an in-memory string what fileToL2 does for a file */
sparseRIV text2L2(char *text);
/* text2L2 reads whitespace delimited words from a string and returns the
 * sparse RIV that is the sum of the base RIVs of those words.
 * text: NUL terminated input; words longer than 99 characters are split
 * returns the consolidated sparse RIV, with frequency set to the number
 * of words read and boolean set to 1.
 * the word locations are collected in the shared RIVKey.h_tempBlock,
 * which is grown when needed; if it cannot be grown, reading stops and
 * the words gathered so far are consolidated.
 * NOTE(review): output.frequency is dereferenced without being allocated
 * here (fileToL2 allocates it) - confirm consolidateI2S provides it
 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	unsigned int locationCount = 0;
	int displacement = 0;
	/* sscanf returns 1 only when a word was read; at the end of the text
	 * it returns EOF, which is non-zero and must not count as success */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* %n covers leading whitespace plus the word, so this lands on the
		 * character right after the word and never passes the terminator */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text */
	*(output.frequency) = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace delimited words from a file and returns the
 * sparse RIV that is the sum of the base RIVs of those words.
 * data: open input stream, read until no further word can be read
 * returns the consolidated sparse RIV, with a newly allocated frequency
 * holding the number of words read and boolean set to 1.
 * every word is processed, including a final word that is not followed
 * by whitespace, and a read error ends the loop instead of repeating it.
 * the word locations are collected in the shared RIVKey.h_tempBlock,
 * which is grown when needed; if it cannot be grown, reading stops and
 * the words gathered so far are consolidated
 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf returns 1 for a word and EOF (non-zero) at end of input */
	while(fscanf(data, "%99s", word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	output.frequency = malloc(sizeof *output.frequency);
	if(output.frequency){
		/* frequency records the number of words in this file */
		*(output.frequency) = locationCount/NONZEROS;
	}
	output.boolean = 1;
	return output;
}
/* fileToL2Clean works like fileToL2 but skips every word for which
 * isWordClean reports false.
 * data: open input stream, read until no further word can be read
 * returns the consolidated sparse RIV, with frequency set to the number
 * of words kept and boolean set to 1.
 * every word is examined, including a final word that is not followed by
 * whitespace, and a read error ends the loop instead of repeating it.
 * NOTE(review): output.frequency is dereferenced without being allocated
 * here (fileToL2 allocates it) - confirm consolidateI2S provides it
 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* fscanf returns 1 for a word and EOF (non-zero) at end of input */
	while(fscanf(data, "%99s", word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	*(output.frequency) = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S converts an implicit RIV into a sparse RIV.
 * implicit:   list of valueCount locations
 * valueCount: number of locations
 * lists longer than RIVKey.I2SThreshold go through
 * consolidateI2SIndirect, all others through consolidateI2SDirect
 */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	if(valueCount <= RIVKey.I2SThreshold){
		return consolidateI2SDirect(implicit, valueCount);
	}
	return consolidateI2SIndirect(implicit, valueCount);
}
/* aggregateWord2D adds the base RIV of a word directly into a dense RIV.
 * the random generator is seeded from the word, then NONZEROS times a
 * pseudo-random element is incremented and another is decremented.
 * destination: dense RIV whose values array is updated in place
 * word:        NUL terminated word
 */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	int remaining = NONZEROS;
	while(remaining > 0){
		/* the increment draws from rand() first, then the decrement */
		int raised = rand()%RIVSIZE;
		destination.values[raised] += 1;
		int lowered = rand()%RIVSIZE;
		destination.values[lowered] -= 1;
		remaining--;
	}
}
/* cosCompare returns the cosine between a dense and a sparse RIV: their
 * dot product divided by the product of their magnitudes.
 * baseRIV:    dense RIV; values and magnitude are read
 * comparator: sparse RIV; values, locations, count and magnitude are read
 * both magnitude fields must already have been computed by the caller.
 * the dot product is accumulated in 64 bits so it cannot overflow int,
 * and 0 is returned when either magnitude is 0 instead of dividing by 0
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	long long dot = 0;
	const int *values = comparator.values;
	const int *locations = comparator.locations;
	const int *locations_Stop = locations+comparator.count;
	for(; locations<locations_Stop; locations++, values++){
		/* only locations present in the sparse RIV contribute */
		dot += (long long)(*values)*baseRIV.values[*locations];
	}
	float denominator = baseRIV.magnitude*comparator.magnitude;
	if(denominator == 0.0f){
		return 0.0f;
	}
	return dot/denominator;
}
/* getMagnitudeSparse returns the euclidean magnitude of a RIV.
 * input: if its SPARSE flag is set, input.count values are read,
 *        otherwise RIVSIZE values
 * input is passed by value, so the magnitude is only returned; the caller
 * has to store it if it should be kept.
 * each square is formed in 64 bits so the product cannot overflow int
 */
float getMagnitudeSparse(RIV input){
	size_t count = (input.flags & SPARSE) ? input.count : RIVSIZE;
	unsigned long long int temp = 0;
	const int *values = input.values;
	const int *values_stop = values+count;
	for(; values<values_stop; values++){
		temp += (unsigned long long int)((long long)(*values)*(*values));
	}
	return (float)sqrt((double)temp);
}
/* lexPull fetches the dense RIV of a word.
 * word: NUL terminated word, used as the file name under "lexicon/"
 * with a cache (CACHESIZE > 0) the cache slot selected by the word's seed
 * is checked first and returned if it holds this word. otherwise the
 * lexicon file is read with fLexPull; if it cannot be opened a vector
 * from denseAllocate is returned instead.
 * the file path and the stored name are written with bounded copies: a
 * word too long for the path buffer is treated as having no file, and the
 * name is truncated to fit output.name
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%RIVKey.cacheSize;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	/* only open the file when the whole path fitted into the buffer */
	if(pathLength > 0 && (size_t)pathLength < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	if(lexWord){
		/* this lexicon file already exists: pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector */
		output = denseAllocate();
	}
	snprintf(output.name, sizeof output.name, "%s", word);
	return output;
}
/* lexPush stores a dense RIV, either in the cache or in its lexicon file.
 * RIVout: the RIV to store
 * without a cache (CACHESIZE == 0) the RIV is handed to fLexPush and 0 is
 * returned. with a cache, the slot selected by the seed of RIVout.name is
 * used: an empty slot takes the RIV; an occupied slot is replaced only if
 * RIVout has the higher frequency, in which case the previous occupant is
 * written out with fLexPush and that call's result is returned; otherwise
 * RIVout itself is written out with fLexPush.
 * NOTE(review): the result of fLexPush is returned only in the
 * replacement branch; the other calls ignore it and return 0
 */
int lexPush(denseRIV RIVout){
//printf("%s\n", (*RIVout).name);
#if CACHESIZE == 0
fLexPush(RIVout);
return 0;
#else /* CACHESIZE != 0 */
/* if our RIV was cached, there are two options (hopefully)
* either the RIV is still cached, and the data has been updated to the cache
* or the RIV was pushed out from under it, in which case it has already been pushed*/
if(RIVout.cached){
return 0;
}
/* pick the cache slot for this word from its seed */
srand(wordtoSeed((unsigned char*)RIVout.name));
int hash = rand()%RIVKey.cacheSize;
/* an unused slot simply takes the RIV */
if(!RIVKey.RIVCache[hash].cached){
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return 0;
/* if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
//scanf("%f", &(*RIVout).magnitude);
//printf("%s replacing %s\n", (*RIVout).name, RIVKey.RIVCache[hash].name);
/* push the current cache entry to a file */
int diag = fLexPush(RIVKey.RIVCache[hash]);
/* replace the cache entry with the current RIV */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return diag;
}else{
/* push current RIV to file */
fLexPush(RIVout);
}
return 0;
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct reads whitespace delimited words from a file, adds each
 * word's base RIV straight into a temporary dense vector and returns the
 * consolidated sparse RIV.
 * data: open input stream, read until no further word can be read
 * returns the sparse RIV with frequency set to the number of words that
 * were aggregated and boolean set to 1.
 * the dense vector lives in RIVKey.h_tempBlock, which must hold RIVSIZE
 * ints and is overwritten.
 * NOTE(review): output.frequency is dereferenced without being allocated
 * here - confirm consolidateD2S provides it
 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	/* a temporary dense RIV is stored in the tempBlock */
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;
	/* fscanf returns 1 for a word and EOF (non-zero) at end of input, so
	 * only a return of 1 may be treated as a word */
	while(fscanf(data, "%99s", word) == 1){
		/* add word's L1 RIV to the accumulating dense RIV */
		aggregateWord2D(denseTemp, (char*)word);
		count++;
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	/* frequency records the number of words in this file */
	*(output.frequency) = count;
	output.boolean = 1;
	return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLowerMorphic.h"
#include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
*/
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
* that pass isWordClean (lowercase letters and the '_' symbol)
* this is important if you will be lexPush-ing those words later
*/
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct has no definition in this part of the file.
* NOTE(review): presumably it accumulates words directly into a dense
* vector before consolidating - confirm against its definition
*/
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" (cosine) between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* NOTE(review): wordtoL2 is declared here but not defined in this part
* of the file - presumably it builds the RIV of a single word; confirm
*/
sparseRIV wordtoL2(char* word);
/* consolidateI2S turns an implicit RIV (a list of valueCount locations)
* into a sparse RIV, picking the direct or indirect method by size
*/
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* text2L2 does for an in-memory string what fileToL2 does for a file */
sparseRIV text2L2(char *text);
/* text2L2 reads whitespace delimited words from a string and returns the
 * sparse RIV that is the sum of the base RIVs of those words.
 * text: NUL terminated input; words longer than 99 characters are split
 * returns the consolidated sparse RIV, with frequency set to the number
 * of words read and boolean set to 1.
 * the word locations are collected in the shared RIVKey.h_tempBlock,
 * which is grown when needed; if it cannot be grown, reading stops and
 * the words gathered so far are consolidated.
 * NOTE(review): output.frequency is dereferenced without being allocated
 * here (fileToL2 allocates it) - confirm consolidateI2S provides it
 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	unsigned int locationCount = 0;
	int displacement = 0;
	/* sscanf returns 1 only when a word was read; at the end of the text
	 * it returns EOF, which is non-zero and must not count as success */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* %n covers leading whitespace plus the word, so this lands on the
		 * character right after the word and never passes the terminator */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text */
	*(output.frequency) = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace delimited words from a file and returns the
 * sparse RIV that is the sum of the base RIVs of those words.
 * data: open input stream, read until no further word can be read
 * returns the consolidated sparse RIV, with a newly allocated frequency
 * holding the number of words read and boolean set to 1.
 * every word is processed, including a final word that is not followed
 * by whitespace, and a read error ends the loop instead of repeating it.
 * the word locations are collected in the shared RIVKey.h_tempBlock,
 * which is grown when needed; if it cannot be grown, reading stops and
 * the words gathered so far are consolidated
 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf returns 1 for a word and EOF (non-zero) at end of input */
	while(fscanf(data, "%99s", word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	output.frequency = malloc(sizeof *output.frequency);
	if(output.frequency){
		/* frequency records the number of words in this file */
		*(output.frequency) = locationCount/NONZEROS;
	}
	output.boolean = 1;
	return output;
}
/* fileToL2Clean works like fileToL2 but skips every word for which
 * isWordClean reports false.
 * data: open input stream, read until no further word can be read
 * returns the consolidated sparse RIV, with frequency set to the number
 * of words kept and boolean set to 1.
 * every word is examined, including a final word that is not followed by
 * whitespace, and a read error ends the loop instead of repeating it.
 * NOTE(review): output.frequency is dereferenced without being allocated
 * here (fileToL2 allocates it) - confirm consolidateI2S provides it
 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* fscanf returns 1 for a word and EOF (non-zero) at end of input */
	while(fscanf(data, "%99s", word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				break;
			}
			RIVKey.h_tempBlock = grown;
			locations = grown;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	*(output.frequency) = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (a list of locations) into a
 * sparseRIV, choosing between the two consolidation routines by size:
 * inputs with more than RIVKey.I2SThreshold values go through
 * consolidateI2SIndirect, everything else through consolidateI2SDirect.
 *
 * implicit:   the location list
 * valueCount: number of entries in implicit
 */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
    if(valueCount <= RIVKey.I2SThreshold){
        return consolidateI2SDirect(implicit, valueCount);
    }
    return consolidateI2SIndirect(implicit, valueCount);
}
/* aggregateWord2D adds a word's pseudo-random pattern straight into a
 * dense vector. The generator is seeded from the word, so the same word
 * always touches the same sequence of cells.
 *
 * destination: dense RIV whose values array is modified in place
 * word:        the word to aggregate
 *
 * NOTE(review): each pass does one increment and one decrement, so the
 * loop makes 2*NONZEROS updates in total - confirm that is the intended
 * number of non-zero contributions per word.
 */
void aggregateWord2D(denseRIV destination, char* word){
    int *cells = destination.values;
    int remaining = NONZEROS;

    srand(wordtoSeed((unsigned char*)word));
    while(remaining-- > 0){
        /* order matters: the +1 cell is drawn before the -1 cell */
        cells[rand()%RIVSIZE]++;
        cells[rand()%RIVSIZE]--;
    }
}
/* cosCompare returns the cosine between a dense and a sparse RIV.
 * The dot product is taken over the sparse vector's entries only: each
 * sparse value is multiplied by the dense value at the matching location.
 * The result is that dot product divided by the product of the two stored
 * magnitudes; the magnitudes are read as-is and not recomputed here.
 *
 * baseRIV:    dense vector, indexed by the sparse vector's locations
 * comparator: sparse vector supplying values, locations and count
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
    int dot = 0;

    for(size_t i = 0; i < comparator.count; i++){
        dot += comparator.values[i] * baseRIV.values[comparator.locations[i]];
    }
    return dot/(baseRIV.magnitude*comparator.magnitude);
}
/* getMagnitudeSparse returns the Euclidean magnitude (square root of the
 * sum of squared values) of a RIV.
 * When the SPARSE flag is set, input.count values are read; otherwise
 * RIVSIZE values are read.
 *
 * input is passed by value, so this function cannot update the caller's
 * magnitude field: callers must store the returned value themselves.
 */
float getMagnitudeSparse(RIV input){
    size_t count = (input.flags & SPARSE) ? input.count : RIVSIZE;
    unsigned long long sumOfSquares = 0;

    for(size_t i = 0; i < count; i++){
        /* widen before squaring so a large value cannot overflow int */
        long long value = input.values[i];
        sumOfSquares += (unsigned long long)(value*value);
    }
    return sqrt(sumOfSquares);
}
/* lexPull fetches the dense RIV stored for a word.
 * With a cache compiled in (CACHESIZE > 0) the word's cache slot is checked
 * first and, on a name match, the cached entry is returned directly.
 * Otherwise the file "lexicon/<word>" is read through fLexPull; if it
 * cannot be opened, a fresh vector from denseAllocate is returned instead.
 * The returned RIV's name is set to word (truncated to fit the name field).
 *
 * word: NUL-terminated lexicon key
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
    /* if there is a cache, first check if the word is cached */
    srand(wordtoSeed((unsigned char*)word));
    int hash = rand()%RIVKey.cacheSize;
    if(!strcmp(word, RIVKey.RIVCache[hash].name)){
        /* if word is cached, pull from cache and exit */
        return RIVKey.RIVCache[hash];
    }
#endif /* CACHESIZE > 0 */

    denseRIV output;
    char pathString[200];
    FILE *lexWord = NULL;

    /* bounded formatting: a word too long for the path buffer is treated
     * as having no lexicon file instead of overflowing the buffer */
    int written = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
    if(written >= 0 && (size_t)written < sizeof pathString){
        lexWord = fopen(pathString, "rb");
    }

    /* if this lexicon file already exists */
    if(lexWord){
        /* pull data from file */
        output = fLexPull(lexWord);
        fclose(lexWord);
    }else{
        /* if file does not exist, return a 0 vector */
        output = denseAllocate();
    }

    /* bounded copy; always NUL-terminated */
    snprintf(output.name, sizeof output.name, "%s", word);
    return output;
}
/* lexPush hands a dense RIV back to the lexicon.
 * Without a cache (CACHESIZE == 0) the RIV is written out with fLexPush.
 * With a cache:
 *   - a RIV already flagged as cached needs no work;
 *   - an unoccupied slot takes the RIV;
 *   - an occupied slot is evicted (written out with fLexPush) when the
 *     incoming RIV has the higher frequency, and the incoming RIV takes
 *     its place;
 *   - otherwise the incoming RIV is written out with fLexPush.
 *
 * returns: 0 when nothing had to be written, otherwise the result of the
 *          fLexPush call that was made, so write failures reach the caller
 *          on every path.
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
    return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
    /* if our RIV was cached, there are two options (hopefully)
     * either the RIV is still cached, and the data has been updated to the cache
     * or the RIV was pushed out from under it, in which case it has already been pushed */
    if(RIVout.cached){
        return 0;
    }

    srand(wordtoSeed((unsigned char*)RIVout.name));
    int hash = rand()%RIVKey.cacheSize;

    /* slot is free: claim it */
    if(!RIVKey.RIVCache[hash].cached){
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return 0;
    }

    /* if the current RIV is more frequent than the RIV holding its slot */
    if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency)){
        /* push the current cache entry to a file */
        int diag = fLexPush(RIVKey.RIVCache[hash]);
        /* replace the cache entry with the current RIV */
        RIVKey.RIVCache[hash] = RIVout;
        RIVKey.RIVCache[hash].cached = 1;
        return diag;
    }

    /* push current RIV to file */
    return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct reads whitespace-separated words from data and aggregates
 * each one directly into a temporary dense vector kept in
 * RIVKey.h_tempBlock, then consolidates that vector into a sparseRIV.
 *
 * data: an open, readable stream; it is read until the first failed read
 *       (end of file or read error) and is not closed here.
 * returns: the consolidated sparseRIV, with *frequency set to the number
 *       of words aggregated and boolean set to 1.
 */
sparseRIV fileToL2direct(FILE *data){
    unsigned char word[100] = {0};
    denseRIV denseTemp = {0};
    int count = 0;

    /* a temporary dense RIV is stored in the tempBlock
     * NOTE(review): assumes the temp block holds at least RIVSIZE ints -
     * confirm against wherever h_tempBlock is allocated */
    denseTemp.values = RIVKey.h_tempBlock;
    memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));

    /* count and aggregate only words that were actually read, so the
     * failed read at end of file is not counted and a final word without
     * trailing whitespace is not lost */
    while(fscanf(data, "%99s", (char*)word) == 1){
        count++;
        /* add word's L1 RIV to the accumulating dense RIV */
        aggregateWord2D(denseTemp, (char*)word);
    }

    sparseRIV output = consolidateD2S(denseTemp.values);
    /* frequency records the number of words in this file */
    *(output.frequency) = count;
    output.boolean = 1;
    return output;
}
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
/* walks a directory tree, reporting on every regular entry found */
void directoryToL2s(char *rootString);

/* Entry point: initialises the RIV system, then scans the "lexicon/"
 * directory. */
int main(){
    char rootString[] = "lexicon/";

    RIVInit();
    directoryToL2s(rootString);
    return 0;
}
/* directoryToL2s walks the directory rootString and, for every entry that
 * is not a directory, pulls its dense RIV with lexPull, consolidates it to
 * a sparse RIV and prints the entry's path with its count of sparse
 * entries (labelled "saturation"). Subdirectories are visited recursively.
 * Entries whose name starts with '.' are skipped.
 *
 * rootString: directory path, expected to end in '/', since entry names
 *             are appended to it directly.
 *
 * If an entry cannot be opened, scanning of this directory stops.
 */
void directoryToL2s(char *rootString){
    sparseRIV fileRIV;
    char pathString[2000];
    struct dirent *files = 0;
    DIR *directory = opendir(rootString);

    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }

    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;

        if(files->d_type == DT_DIR){
            int dirLen = snprintf(pathString, sizeof pathString, "%s%s/",
                                  rootString, files->d_name);
            if(dirLen < 0 || (size_t)dirLen >= sizeof pathString){
                fprintf(stderr, "path too long, skipping %s%s/\n",
                        rootString, files->d_name);
                continue;
            }
            directoryToL2s(pathString);
            /* a directory is only descended into, never read as a file */
            continue;
        }

        int pathLen = snprintf(pathString, sizeof pathString, "%s%s",
                               rootString, files->d_name);
        if(pathLen < 0 || (size_t)pathLen >= sizeof pathString){
            fprintf(stderr, "path too long, skipping %s%s\n",
                    rootString, files->d_name);
            continue;
        }

        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }

        /* NOTE(review): lexPull is handed the full path here; if it builds
         * its own "lexicon/" prefix the lookup path is doubled - confirm */
        denseRIV temp = lexPull(pathString);
        fileRIV = consolidateD2S(temp.values);
        /* bounded copy; always NUL-terminated */
        snprintf(fileRIV.name, sizeof fileRIV.name, "%s", pathString);
        float count = fileRIV.count;
        printf("%s, saturation: %f\n", fileRIV.name, count);
        fclose(input);

        /* NOTE(review): if lexPull can return a cache-owned entry, freeing
         * its values here would invalidate the cache - confirm ownership.
         * The arrays held by fileRIV are not released here either. */
        free(temp.values);
    }

    /* release the directory handle on every exit from the loop */
    closedir(directory);
}
File added
rabi_noun s._noun de_noun rabide@yahoo.com prospect_noun place_noun home_noun bellaire_noun tx_noun work_noun
objective_noun financial_adjective engineering_noun position_noun energy_noun trading_noun finance_noun
profile_noun over_other ten_noun year_noun diverse_adjective experience_noun risk_noun analysis_noun management_noun energy_noun sector_noun last_adjective four_noun which_other be_verb trading_noun finance_noun
analytical_noun quantitative_adjective skill_noun structuring_noun pricing_noun energy_noun derivative_noun
expertise_noun trading_noun derivative_noun development_noun trade_noun analytic_noun exposure_noun management_noun risk_verb structure_verb e&p_noun project_noun finance_noun transaction_noun
experience_noun shell_noun capital_noun inc._noun houston_noun tx_noun
present_adjective vice_noun president_noun reports_noun chief_noun financial_noun officer_noun responsible_adjective devise_verb strategy_noun manage_verb price_verb market_noun credit_noun risk_noun within_other structured_adjective transaction_noun
design_verb execute_verb oil_noun gas_noun hedge_noun eight_noun domestic_adjective two_noun international_adjective transaction_noun involve_verb over_other million_noun capital_noun risk_noun
develop_verb implement_verb framework_noun identification_noun mitigation_noun pricing_noun risk_noun producer_noun finance_noun transaction_noun
provide_verb sophisticated_adjective simulation_noun modeling_noun support_noun financial_adjective engineering_noun solution_noun e&p_noun finance_noun leasing_noun small_adjective business_noun finance_noun
led_verb development_noun computational_adjective infrastructure_noun risk_noun modeling_noun pricing_noun
shell_noun oil_noun products_noun company_noun houston_noun tx_noun
trade_noun analytics_noun developer_noun derivatives_noun trader_noun traded_noun future_noun option_noun otc_noun derivative_noun crude_adjective oil_noun heating_noun oil_noun gasoline_noun
manage_verb net_adjective hydrocarbon_noun exposure_noun company_noun
develop_verb analytic_noun identify_verb speculative_adjective program_noun trading_noun opportunity_noun e.g._other refinery_noun margin_noun protection_noun
carry_verb out_adverb simulation_noun back_adverb testing_noun risk_noun adjusted_adjective performance_noun measurement_noun trading_noun strategy_noun
price_verb embedded_adjective cap_noun devise_verb strategy_noun option_noun replication_noun dynamic_adjective hedging_noun
shell_noun e&p_noun technology_noun company_noun houston_noun tx_noun
senior_noun research_noun engineer_noun research_noun engineer_noun use_verb reliability_noun analysis_noun solve_verb wide_adjective variety_noun engineering_noun problem_noun
model_verb environmental_adjective structural_adjective response_noun develop_verb design_noun code_noun criterion_noun carry_verb out_adverb decision_noun analysis_noun under_other uncertainty_noun surface_noun system_noun selection_noun etc._other
deliver_verb enable_verb technology_noun risk-based_adjective design_noun recipe_noun development_noun complex_adjective engineering_noun system_noun range_verb billion-dollar_adjective tension_noun leg_noun platform_noun requalification_noun aging_noun fleet_noun offshore_adjective jacket_noun structure_noun
brown_adjective root_noun inc._noun houston_noun tx_noun
naval_noun architect_noun software_noun troubleshooter_noun carry_verb out_adverb naval_noun architectural_noun design_noun motion_noun response_noun modeling_noun downtime_noun analysis_noun environmental_adjective datum_noun base_noun management_noun software_noun development_noun maintenance_noun support_noun offshore_adjective structure_noun design_noun construction_noun
education_noun university_noun california_noun berkeley_noun ca_noun
ph.d._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun offshore_noun structural_noun system_noun reliability_noun wave-load_noun modeling_noun system_noun behavior_noun analysis_noun
probabilistically_adverb model_verb multidimensional_adjective hazard_noun effect_noun performance_noun complicated_adjective system_noun develop_verb methodology_noun characterize_verb system_noun failure_noun risk_noun
work_verb research_noun associate_noun reliability_noun marine_noun structures_noun center_noun stanford_noun university_noun consultant_noun offshore_adjective oil_noun gas_noun industry_noun
university_noun california_noun berkeley_noun ca_noun
m.s._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun simulation_noun random_noun seaway_noun towing_noun tank_noun random_noun walk_verb frequency_noun method_noun
work_verb research_noun assistant_noun develop_verb software_noun time_noun series_noun analysis_noun model_noun testing_noun calibration_noun
indian_noun institute_noun technology_noun kharagpur_noun india_noun
b.tech._noun naval_noun architecture_noun graduate_verb first_adjective class_noun honor_noun rank_verb first_adverb class_noun
relevant_adjective training_noun credit_noun risk_noun modeling_noun stanford_noun university_noun stanford_noun ca_noun
october_noun finance_noun accounting_noun executive_noun rice_noun university_noun houston_noun tx_noun
august_noun training_noun modules_noun product_noun knowledge_noun structured_noun project_noun finance_noun securitization_noun credit_noun strategy_noun in-house_noun training_noun dc_noun gardener_noun euromoney_noun
january_noun april_noun risk_noun risk_noun conference_noun washington_noun d.c._noun june_noun economics_noun supply_noun refining_noun marketing_noun stone_noun bond_noun corp._noun houston_noun tx_noun
april_noun understand_verb apply_verb financial_adjective mathematics_noun energy_noun derivative_noun efficient_adjective pricing_noun trading_noun risk_noun management_noun risk_noun conferences_noun new_noun york_noun ny_noun
march_noun practical_noun strategic_noun application_noun var_noun energy_noun industries_noun risk_noun conferences_noun houston_noun tx_noun
december_noun financial_noun modeling_noun s-plus_noun mathsoft_noun new_noun york_noun ny_noun
october_noun option_noun analytic_noun pricing_noun option_noun exotic_noun options_noun cibc_noun school_noun financial_noun products_noun houston_noun tx_noun
september_noun fundamental_noun energy_noun basis_noun trading_noun princeton_noun energy_noun houston_noun tx_noun
feb_noun energy_noun derivatives_noun price_noun risk_noun management_noun energy_noun institute_noun univ._noun houston_noun houston_noun tx_noun
january_noun april_noun latest_adjective development_noun advanced_noun mathematics_noun derivative_noun risk_noun conference_noun new_noun york_noun ny_noun
december_noun options_noun seminar_noun nymex_noun houston_noun tx_noun
october_noun
select_verb honors_noun activities_noun present_verb seminar_noun credit_noun risk_noun e&p_noun mezzanine_noun finance_noun global_noun association_noun risk_noun professional_noun houston_noun chapter_noun tx_noun
june_noun special_adjective recognition_noun award_noun shell_noun oil_noun products_noun company_noun
committee_noun membership_noun panelist_noun author_noun lecturer_noun publication_noun reviewer_noun etc._other
asce_noun api_noun otc_noun asme_noun omae_noun etc._other
receive_verb omae_noun award_noun american_noun society_noun mechanical_adjective engineering_noun recognition_noun outstanding_adjective originality_noun significance_noun paper_noun title_verb development_noun reliability-based_adjective global_adjective design_noun equation_noun tension_noun leg_noun platforms_noun
short_adjective course_noun instructor_noun seminar_noun speaker_noun university_noun texas_noun austin_noun rice_noun university_noun university_noun houston_noun
sea_noun grant_noun association_noun award_noun excellence_noun research_noun sea_noun grant_noun association_noun usa_noun
institute_noun silver_noun medal_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun
national_noun science_noun talent_noun search_verb scholarship_noun government_noun india_noun
personal_adjective data_noun date_noun birth_noun september_noun us_noun citizen_noun marry_verb one_noun child_noun
reference_noun available_adjective upon_other request_noun
document_noun properties_noun title_noun rabi_noun s_noun author_noun shell_noun chemical_noun company_noun template_noun normal_adjective last_adjective save_verb grady_adjective revision_noun number_noun application_noun microsoft_noun word_noun
total_adjective editing_noun time_noun last_adjective print_verb create_verb last_adjective save_verb company_noun shell_noun chemical_noun company_noun
rabi_noun s._noun de_noun rabide@yahoo.com prospect_noun place_noun home_noun bellaire_noun tx_noun work_noun
objective_noun financial_adjective engineering_noun position_noun energy_noun trading_noun finance_noun
profile_noun over_other ten_noun year_noun diverse_adjective experience_noun risk_noun analysis_noun management_noun energy_noun sector_noun last_adjective four_noun which_other be_verb trading_noun finance_noun
analytical_noun quantitative_adjective skill_noun structuring_noun pricing_noun energy_noun derivative_noun
expertise_noun trading_noun derivative_noun development_noun trade_noun analytic_noun exposure_noun management_noun risk_verb structure_verb e&p_noun project_noun finance_noun transaction_noun
experience_noun shell_noun capital_noun inc._noun houston_noun tx_noun
present_adjective vice_noun president_noun reports_noun chief_noun financial_noun officer_noun responsible_adjective devise_verb strategy_noun manage_verb price_verb market_noun credit_noun risk_noun within_other structured_adjective transaction_noun
design_verb execute_verb oil_noun gas_noun hedge_noun eight_noun domestic_adjective two_noun international_adjective transaction_noun involve_verb over_other million_noun capital_noun risk_noun
develop_verb implement_verb framework_noun identification_noun mitigation_noun pricing_noun risk_noun producer_noun finance_noun transaction_noun
provide_verb sophisticated_adjective simulation_noun modeling_noun support_noun financial_adjective engineering_noun solution_noun e&p_noun finance_noun leasing_noun small_adjective business_noun finance_noun
led_verb development_noun computational_adjective infrastructure_noun risk_noun modeling_noun pricing_noun
shell_noun oil_noun products_noun company_noun houston_noun tx_noun
trade_noun analytics_noun developer_noun derivatives_noun trader_noun traded_noun future_noun option_noun otc_noun derivative_noun crude_adjective oil_noun heating_noun oil_noun gasoline_noun
manage_verb net_adjective hydrocarbon_noun exposure_noun company_noun
develop_verb analytic_noun identify_verb speculative_adjective program_noun trading_noun opportunity_noun e.g._other refinery_noun margin_noun protection_noun
carry_verb out_adverb simulation_noun back_adverb testing_noun risk_noun adjusted_adjective performance_noun measurement_noun trading_noun strategy_noun
price_verb embedded_adjective cap_noun devise_verb strategy_noun option_noun replication_noun dynamic_adjective hedging_noun
shell_noun e&p_noun technology_noun company_noun houston_noun tx_noun
senior_noun research_noun engineer_noun research_noun engineer_noun use_verb reliability_noun analysis_noun solve_verb wide_adjective variety_noun engineering_noun problem_noun
model_verb environmental_adjective structural_adjective response_noun develop_verb design_noun code_noun criterion_noun carry_verb out_adverb decision_noun analysis_noun under_other uncertainty_noun surface_noun system_noun selection_noun etc._other
deliver_verb enable_verb technology_noun risk-based_adjective design_noun recipe_noun development_noun complex_adjective engineering_noun system_noun range_verb billion-dollar_adjective tension_noun leg_noun platform_noun requalification_noun aging_noun fleet_noun offshore_adjective jacket_noun structure_noun
brown_adjective root_noun inc._noun houston_noun tx_noun
naval_noun architect_noun software_noun troubleshooter_noun carry_verb out_adverb naval_noun architectural_noun design_noun motion_noun response_noun modeling_noun downtime_noun analysis_noun environmental_adjective datum_noun base_noun management_noun software_noun development_noun maintenance_noun support_noun offshore_adjective structure_noun design_noun construction_noun
education_noun university_noun california_noun berkeley_noun ca_noun
ph.d._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun offshore_noun structural_noun system_noun reliability_noun wave-load_noun modeling_noun system_noun behavior_noun analysis_noun
probabilistically_adverb model_verb multidimensional_adjective hazard_noun effect_noun performance_noun complicated_adjective system_noun develop_verb methodology_noun characterize_verb system_noun failure_noun risk_noun
work_verb research_noun associate_noun reliability_noun marine_noun structures_noun center_noun stanford_noun university_noun consultant_noun offshore_adjective oil_noun gas_noun industry_noun
university_noun california_noun berkeley_noun ca_noun
m.s._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun simulation_noun random_noun seaway_noun towing_noun tank_noun random_noun walk_verb frequency_noun method_noun
work_verb research_noun assistant_noun develop_verb software_noun time_noun series_noun analysis_noun model_noun testing_noun calibration_noun
indian_noun institute_noun technology_noun kharagpur_noun india_noun
b.tech._noun naval_noun architecture_noun graduate_verb first_adjective class_noun honor_noun rank_verb first_adverb class_noun
relevant_adjective training_noun credit_noun risk_noun modeling_noun stanford_noun university_noun stanford_noun ca_noun
october_noun finance_noun accounting_noun executive_noun rice_noun university_noun houston_noun tx_noun
august_noun training_noun modules_noun product_noun knowledge_noun structured_noun project_noun finance_noun securitization_noun credit_noun strategy_noun in-house_noun training_noun dc_noun gardener_noun euromoney_noun
january_noun april_noun risk_noun risk_noun conference_noun washington_noun d.c._noun june_noun economics_noun supply_noun refining_noun marketing_noun stone_noun bond_noun corp._noun houston_noun tx_noun
april_noun understand_verb apply_verb financial_adjective mathematics_noun energy_noun derivative_noun efficient_adjective pricing_noun trading_noun risk_noun management_noun risk_noun conferences_noun new_noun york_noun ny_noun
march_noun practical_noun strategic_noun application_noun var_noun energy_noun industries_noun risk_noun conferences_noun houston_noun tx_noun
december_noun financial_noun modeling_noun s-plus_noun mathsoft_noun new_noun york_noun ny_noun
october_noun option_noun analytic_noun pricing_noun option_noun exotic_noun options_noun cibc_noun school_noun financial_noun products_noun houston_noun tx_noun
september_noun fundamental_noun energy_noun basis_noun trading_noun princeton_noun energy_noun houston_noun tx_noun
feb_noun energy_noun derivatives_noun price_noun risk_noun management_noun energy_noun institute_noun univ._noun houston_noun houston_noun tx_noun
january_noun april_noun latest_adjective development_noun advanced_noun mathematics_noun derivative_noun risk_noun conference_noun new_noun york_noun ny_noun
december_noun options_noun seminar_noun nymex_noun houston_noun tx_noun
october_noun
select_verb honors_noun activities_noun present_verb seminar_noun credit_noun risk_noun e&p_noun mezzanine_noun finance_noun global_noun association_noun risk_noun professional_noun houston_noun chapter_noun tx_noun
june_noun special_adjective recognition_noun award_noun shell_noun oil_noun products_noun company_noun
committee_noun membership_noun panelist_noun author_noun lecturer_noun publication_noun reviewer_noun etc._other
asce_noun api_noun otc_noun asme_noun omae_noun etc._other
receive_verb omae_noun award_noun american_noun society_noun mechanical_adjective engineering_noun recognition_noun outstanding_adjective originality_noun significance_noun paper_noun title_verb development_noun reliability-based_adjective global_adjective design_noun equation_noun tension_noun leg_noun platforms_noun
short_adjective course_noun instructor_noun seminar_noun speaker_noun university_noun texas_noun austin_noun rice_noun university_noun university_noun houston_noun
sea_noun grant_noun association_noun award_noun excellence_noun research_noun sea_noun grant_noun association_noun usa_noun
institute_noun silver_noun medal_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun
national_noun science_noun talent_noun search_verb scholarship_noun government_noun india_noun
personal_adjective data_noun date_noun birth_noun september_noun us_noun citizen_noun marry_verb one_noun child_noun
reference_noun available_adjective upon_other request_noun
document_noun properties_noun title_noun rabi_noun s_noun author_noun shell_noun chemical_noun company_noun template_noun normal_adjective last_adjective save_verb grady_adjective revision_noun number_noun application_noun microsoft_noun word_noun
total_adjective editing_noun time_noun last_adjective print_verb create_verb last_adjective save_verb company_noun shell_noun chemical_noun company_noun
meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun
calendar_noun entry_noun appointment_noun
description_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun jerry_noun conference_noun room_noun
date_noun time_noun pm_noun pm_noun central_noun standard_noun time_noun
detailed_adjective description_noun united_noun states_noun license_noun
meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun
calendar_noun entry_noun appointment_noun
description_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun jerry_noun conference_noun room_noun
date_noun time_noun pm_noun pm_noun central_noun standard_noun time_noun
detailed_adjective description_noun united_noun states_noun license_noun
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment