Commit f258c284 by simetk

begining GPU integration

parent f2876a83
...@@ -145,5 +145,316 @@ denseRIV denseAllocate(); ...@@ -145,5 +145,316 @@ denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg); void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */ /* begin definitions */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
int *locations_slider = input.locations;
int *values_slider = input.values;
int *locations_stop = locations_slider+input.count;
/* apply values at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] += *values_slider;
locations_slider++;
values_slider++;
}
return destination;
}
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
/*apply values +1 or -1 at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] +=1;
locations_slider++;
destination[*locations_slider] -= 1;
locations_slider++;
}
return destination;
}
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
/*apply values +1 or -1 at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] +=1;
locations_slider++;
destination[*locations_slider] -= 1;
locations_slider++;
}
return destination;
}
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp);
free(denseTemp);
return sparseOut;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0;
int add = 1;
int found;
for(int i=0; i<valueCount; i++){
found = 0;
for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add;
add *= -1;
found = 1;
}
}
if(!found){
locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add;
sparseOut.count++;
add*= -1;
}
}
sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
return sparseOut;
}
sparseRIV consolidateD2S(int *denseInput){
sparseRIV output;
output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */
int* locations = RIVKey.h_tempBlock+RIVSIZE;
int* values = locations+RIVSIZE;
int* locations_slider = locations;
int* values_slider = values;
for(int i=0; i<RIVSIZE; i++){
/* act only on non-zeros */
if(denseInput[i]){
/* assign index to locations */
*(locations_slider++) = i;
/* assign value to values */
*(values_slider++) = denseInput[i];
/* track size of forming sparseRIV */
output.count++;
}
}
/* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge
}
/* copy locations values into opened slot */
memcpy(output.locations, locations, output.count*sizeof(int));
output.values = output.locations + output.count;
/* copy values into opened slot */
memcpy(output.values, values, output.count*sizeof(int));
return output;
}
void RIVInit(){
RIVKey.I2SThreshold = sqrt(RIVSIZE);
/* open a slot at least large enough for worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
struct sigaction action;
action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO;
//for(int i=1; i<27; i++){
sigaction(11,&action,NULL);
//}
RIVKey.h_tempBlock = (int*)malloc(3*RIVSIZE*sizeof(int));
RIVKey.tempSize = 3*RIVSIZE;
RIVKey.thing = 0;
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
void RIVCleanup(){
if(cacheDump()){
puts("cache dump failed, some lexicon data was lost");
}
free(RIVKey.h_tempBlock);
}
int wordtoSeed(unsigned char* word){
int i=0;
int seed = 0;
while(*word){
/* left-shift 5 each time *should* make seeds unique to words
* this means letters are taken as characters couned in base 32, which
* should be large enough to hold all english characters plus a few outliers
* */
seed += (*(word))<<(i*5);
word++;
i++;
}
return seed;
}
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
locations+=count;
srand(wordtoSeed(word));
int *locations_stop = locations+NONZEROS;
while(locations<locations_stop){
/* unrolled for speed, guaranteed to be an even number of steps */
*locations = rand()%RIVSIZE;
locations++;
*locations = rand()%RIVSIZE;
locations++;
}
return;
}
int fLexPush(denseRIV RIVout){
char pathString[200] = {0};
/* word data will be placed in a (new?) file under the lexicon directory
* in a file named after the word itself */
sprintf(pathString, "lexicon/%s", RIVout.name);
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
return 1;
}
sparseRIV temp = consolidateD2S(RIVout.values);
if(temp.count<(RIVSIZE/2)){
/* smaller stored as sparse vector */
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(temp.locations, temp.count, sizeof(int), lexWord);
fwrite(temp.values, temp.count, sizeof(int), lexWord);
}else{
/* saturation is too high, better to store dense */
/* there's gotta be a better way to do this */
temp.count = 0;
fwrite(&temp.count, 1, sizeof(int), lexWord);
fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
}
fclose(lexWord);
free(RIVout.values);
free(temp.locations);
return 0;
}
denseRIV fLexPull(FILE* lexWord){
denseRIV output = denseAllocate();
int typeCheck;
/* get metadata for vector */
fread(&typeCheck, 1, sizeof(unsigned int), lexWord);
fread(output.frequency, 1, sizeof(int), lexWord);
fread(&(output.magnitude), 1, sizeof(int), lexWord);
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
sparseRIV temp;
/* value was not 0, so it's the value count */
temp.count = typeCheck;
temp.locations = (int*)malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
fread(temp.locations, temp.count, sizeof(int), lexWord);
fread(temp.values, temp.count, sizeof(int), lexWord);
addS2D(output.values, temp);
free(temp.locations);
}else{
/* typecheck is thrown away, just a flag in this case */
fread(output.values, RIVSIZE, sizeof(int), lexWord);
}
output.cached = 0;
return output;
}
void signalSecure(int signum, siginfo_t *si, void* arg){
if(cacheDump()){
puts("cache dump failed, some lexicon data lost");
}else{
puts("cache dumped successfully");
}
signal(signum, SIG_DFL);
kill(getpid(), signum);
}
int cacheDump(){
int i=0;
int j=0;
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
j++;
flag += fLexPush(*cache_slider);
}
else{
i++;
}
cache_slider++;
}
printf("%d cacheslots unused\n%d, cacheslots used", i, j);
return flag;
}
denseRIV denseAllocate(){
/* allocates a 0 vector */
denseRIV output;
output.values = (int*)calloc(RIVSIZE+1, sizeof(int));
/* for compact memory use, frequency is placed immediately after values */
output.frequency = output.values+RIVSIZE;
output.magnitude = 0;
output.cached = 0;
return output;
}
/*TODO add a simplified free function*/
#endif #endif
...@@ -7,4 +7,24 @@ int isWordClean(char* word); ...@@ -7,4 +7,24 @@ int isWordClean(char* word);
/* used by wordClean */ /* used by wordClean */
int isLetter(char c); int isLetter(char c);
int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
else return 0;
}
int isWordClean(char* word){
char *letter = word;
char *word_stop = word+99;
while(letter<word_stop){
if(!(*letter)) break;
if(!(isLetter(*letter))){
return 0;
}
letter++;
}
return 1;
}
#endif #endif
File deleted
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.70
#include "RIVtools.h"
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
static void HandleError(cudaError_t err, const char *file, int line){
if(err !=cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
__global__ void d_mapS2D(int *d_denseSlot, int *d_sparseSlot, int count){
int id = blockIdx.x*blockDim.x + threadIdx.x;
if(!id<count) return;
int *target = d_sparseSlot+id;
d_denseSlot[*target] = *(target+count);
}
__global__ void cosines(int* d_denseBase, int* d_sparseBlock, int* output, int RIVcount){
int id =blockIdx.x*blockDim.x + threadIdx.x;
if(id>=RIVcount) return;
int count = *(d_sparseBlock+RIVSIZE*id);
int *locations = &count+1;
int *values = locations+count;
int dot = 0;
output+=id;
while(count--){
dot+= values[count]*d_denseBase[locations[count]];
}
*output = dot;
}
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
float** cosineMatrix(sparseRIV* RIVs, int RIVcount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
float cosine;
float minmag;
float maxmag;
denseRIV baseDense;
baseDense.values = (int*)malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
int count = 0;
cosineMatrix(fileRIVs, fileCount);
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", RIVKey.thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
char pathString[2000];
DIR *directory;
struct dirent *files = 0;
if(!(directory = opendir(rootString))){
printf("location not found, %s\n", rootString);
return;
}
while((files=readdir(directory))){
if(*(files->d_name) == '.') continue;
if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryToL2s(pathString, fileRIVs, fileCount);
}
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
FILE *input = fopen(pathString, "r");
if(!input){
printf("file %s doesn't seem to exist, breaking out of loop", pathString);
return;
}else{
(*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
(*fileRIVs)[(*fileCount)] = fileToL2(input);
strcpy((*fileRIVs)[(*fileCount)].name, pathString);
fclose(input);
(*fileCount)++;
}
}
}
float** cosineMatrix(sparseRIV* RIVs, int RIVcount){
int *d_massiveBlock;
cudaMalloc((void**)&d_massiveBlock, 100000000*sizeof(int));
int *d_outputSlot = d_massiveBlock;
int *d_denseSlot = d_outputSlot+(RIVcount*RIVcount/2);
int *d_sparseSection =d_denseSlot+RIVSIZE;
int *d_sparse_slider = d_sparseSection;
for(int i=0; i<RIVcount; i++){
HANDLE_ERROR (cudaMemcpy (d_sparse_slider++, &RIVs[i].count, sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMemcpy (d_sparse_slider, RIVs[i].locations, RIVs[i].count*2*sizeof(int), cudaMemcpyHostToDevice));
d_sparse_slider+=RIVs[i].count*2;
}
}
File deleted
File deleted
...@@ -47,4 +47,289 @@ sparseRIV consolidateI2S(int *implicit, size_t valueCount); ...@@ -47,4 +47,289 @@ sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV text2L2(char *text); sparseRIV text2L2(char *text);
float getMagnitudeSparse(sparseRIV input); float getMagnitudeSparse(sparseRIV input);
sparseRIV text2L2(char *text){
unsigned int blockSize;
char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
int displacement;
while(sscanf(text, "%99s%n", word, &displacement)){
text += displacement+1;
if(!displacement){
break;
}
if(!(*word)){
break;
}
blockSize = locationCount+NONZEROS;
/* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS;
}
/* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations((unsigned char*)word, locations, locationCount);
locationCount+= NONZEROS;
}
sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file, untill frequency
* is needed to hold some more useful data point */
output.frequency = locationCount/NONZEROS;
output.boolean = 1;
return output;
}
sparseRIV fileToL2(FILE *data){
unsigned int blockSize;
unsigned char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
while(fscanf(data, "%99s", word)){
if(feof(data)){
break;
}
if(!(*word)){
break;
}
blockSize = locationCount+NONZEROS;
/* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS;
}
/* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS;
}
sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS;
output.boolean = 1;
return output;
}
sparseRIV fileToL2Clean(FILE *data){
unsigned char word[100] = {0};
int *locations = RIVKey.h_tempBlock;
unsigned int blockSize;
int locationCount = 0;
while(fscanf(data, "%99s", word)){
if(feof(data)){
break;
}
if(!(*word)){
break;
}
/* if the word is not clean, skip it */
if(!isWordClean((char*)word)){
continue;
}
blockSize = locationCount+NONZEROS;
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS;
}
makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS;
}
sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS;
output.boolean = 1;
return output;
}
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){
/*direct method is faster on small datasets, but has geometric scaling on large datasets */
return consolidateI2SDirect(implicit, valueCount);
}else{
/* optimized for large datasets */
return consolidateI2SIndirect(implicit, valueCount);
}
}
void aggregateWord2D(denseRIV destination, char* word){
srand(wordtoSeed((unsigned char*)word));
for(int i=0; i<NONZEROS; i++){
destination.values[(rand()%RIVSIZE)] +=1;
destination.values[(rand()%RIVSIZE)] -= 1;
}
}
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0;
int n = comparator.count;
while(n){
n--;
/* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/
//dot += values[i]*baseRIV.values[locations[i]];
dot += comparator.values[n] * baseRIV.values[comparator.locations[n]];
//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
}
/*dot divided by product of magnitudes */
float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
return cosine;
}
float getMagnitudeSparse(sparseRIV input){
unsigned long long int temp = 0;
int *values = input.values;
int *values_stop = values+input.count;
while(values<values_stop){
temp += (*values)*(*values);
values++;
}
input.magnitude = sqrt(temp);
return input.magnitude;
}
denseRIV lexPull(char* word){
#if CACHESIZE > 0
/* if there is a cache, first check if the word is cached */
srand(wordtoSeed((unsigned char*)word));
int hash = rand()%CACHESIZE;
if(!strcmp(word, RIVKey.RIVCache[hash].name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
denseRIV output;
char pathString[200];
sprintf(pathString, "lexicon/%s", word);
FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */
if(lexWord){
/* pull data from file */
output = fLexPull(lexWord);
fclose(lexWord);
}else{
/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons?
output = denseAllocate();
}
strcpy(output.name, word);
return output;
}
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
/* if there is no cache, simply push to file */
fLexPush(RIVout);
return 0;
#else /* CACHESIZE != 0 */
/* if our RIV was cached, there are two options (hopefully)
* either the RIV is still cached, and the data has been updated
* to the cache or the RIV was pushed out from under it,
* in which case it has already been pushed! move on*/
if(RIVout.cached){
return 0;
}
srand(wordtoSeed((unsigned char*)RIVout.name));
int hash = rand()%CACHESIZE;
if(!RIVKey.RIVCache[hash].cached){
/* if there is no word in this cache slot, push to cache instead of file */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return 0;
/*if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
/* push the current cache entry to a file */
int diag = fLexPush(RIVKey.RIVCache[hash]);
/* push the current RIV to cache */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return diag;
}else{
/* push current RIV to file */
fLexPush(RIVout);
}
return 0;
#endif /* CACHESIZE == 0 */
}
sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0};
denseRIV denseTemp;
// a temporary dense RIV is stored in the tempBlock
denseTemp.values = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
int count = 0;
while(fscanf(data, "%99s", word)){
count++;
if(feof(data)){
break;
}
if(!(*word)){
break;
}
// add word's L1 RIV to the accumulating implicit RIV
aggregateWord2D(denseTemp, (char*)word);
}
sparseRIV output = consolidateD2S(denseTemp.values);
// frequency records the number of words in this file
output.frequency = count;
output.boolean = 1;
return output;
}
#endif #endif
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
struct RIVData{
int RIVsize;
int nonZeros;
long long int *masks;
int *h_tempBlock;
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
int thing;
}RIVKeyData;
typedef struct{
char name[100];
int *values;
int *locations;
int count;
int frequency;
float magnitude;
int boolean;
}sparseRIV;
sparseRIV FileToL2(FILE *data);
void consolidateD2S(sparseRIV *destination, int *denseInput);
void setKeyData(int RIVsize, int nonZeros, int blockSize);
int* mapS2D(int * destination, sparseRIV input);
int* makeSparseLocations(int *seeds, int seedCount);
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
void getMagnitudes(sparseRIV *inputs, int RIVCount);
int *mapI2D(int *locations, int seedCount);
sparseRIV text2L2(unsigned char *text);
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
sparseRIV FileToL2(FILE *data){
unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = RIVKeyData.h_tempBlock;
int seedCount = 0;
while(fscanf(data, "%s", word)){
if(feof(data)){
break;
}
if(!(*word)){
break;
}
makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000);
}
int *locations = makeSparseLocations(seeds, seedCount);
//printf("mcshittles");
int *L2dense;
L2dense = mapI2D(locations, seedCount);
sparseRIV output;
//printf("tits");
consolidateD2S( &output, L2dense);
free(L2dense);
output.boolean = 1;
RIVKeyData.thing++;
return output;
}
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
int *baseDenseRIV = RIVKeyData.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV);
float *outputs = (float*)malloc((multiplierCount)* sizeof(float));
float *output_slider = outputs;
sparseRIV *multipliersStop = multipliers+multiplierCount;
float minsize = baseRIV.magnitude * .75;
float maxsize = baseRIV.magnitude * 1.25;
while(multipliers<multipliersStop){
if(((*multipliers).boolean) /*&& (((*multipliers).magnitude < maxsize) && ((*multipliers).magnitude > minsize))*/){
int dot = 0;
int *values = (*multipliers).values;
int *locations = (*multipliers).locations;
int *locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++;
values++;
}
*output_slider= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
if(*output_slider>=threshold){
printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, *output_slider);
(*multipliers).boolean = 0;
//RIVKeyData.thing ++;
}
}
multipliers++;
output_slider++;
}
return outputs;
}
void getMagnitudes(sparseRIV *inputs, int RIVCount){
for(int i=0; i<RIVCount; i++){
int temp = 0;
int *values = inputs[i].values;
int *values_stop = values+inputs[i].count;
while(values<values_stop){
temp += (*values)*(*values);
values++;
}
float magnitude = sqrt(temp);
inputs[i].magnitude = magnitude;
//printf("magnitude = %f, \n", magnitude);
}
}
int* mapS2D(int* destination, sparseRIV input){
memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
int *locations_slider = input.locations;
int *values_slider = input.values;
int *locations_stop = locations_slider+input.count;
while(locations_slider<locations_stop){
destination[*locations_slider] = *values_slider;
locations_slider++;
values_slider++;
}
//HANDLE_ERROR (cudaMemcpy (RIVKeyData.d_OpenSlot, destination, RIVKeyData.RIVsize*sizeof(int), cudaMemcpyHostToDevice));
return destination;
}
int* mapI2D(int *locations, int valueCount){
int *destination = (int*)calloc(RIVKeyData.RIVsize,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
int value = 1;
while(locations_slider<locations_stop){
destination[*locations_slider] +=value;
locations_slider++;
value = (value == 1)? -1: 1;
}
return destination;
}
void consolidateD2S(sparseRIV *destination, int *denseInput){
int count = 0;
(*destination).locations = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
(*destination).values = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
for(int i=0; i<RIVKeyData.RIVsize; i++){
if(denseInput[i]){
(*destination).locations[count] = i;
(*destination).values[count] = denseInput[i];
count++;
}
}
destination->count = count;
(*destination).locations = (int*) realloc((*destination).locations, (*destination).count*sizeof(int));
(*destination).values = (int*) realloc((*destination).values, (*destination).count*sizeof(int));
}
void setKeyData(int RIVsize, int nonZeros, int blockSize){
RIVKeyData.RIVsize = RIVsize;
if(nonZeros%2){
printf("your nonZeros must be an even number");
nonZeros++;
printf(", changed to %d", nonZeros);
}
RIVKeyData.nonZeros = nonZeros;
RIVKeyData.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
for(int i = 0; i<nonZeros; i++){
RIVKeyData.masks[i] = SEEDMASK>>(5*i);
}
RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
//RIVKeyData.h_stagingBlock = (int*)malloc(blockSize*sizeof(int));
//RIVKeyData.h_staging_slider = RIVKeyData.h_stagingBlock;
RIVKeyData.thing = 0;
}
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
int i=0;
int seedbase = 0;
while(*word){
seedbase += (*(word))<<(i*5);
word++;
i++;
}
int *seedTrack = (*seeds)+*seedCount;
for(i =0 ; i<RIVKeyData.nonZeros; i++){
*seedTrack = (seedbase>>i)+(3*i);
seedTrack++;
}
*seedCount+=RIVKeyData.nonZeros;
return;
}
int* makeSparseLocations(int* seeds, int seedCount){
int *locations = RIVKeyData.h_tempBlock;
int *locations_slider = locations;
int *seeds_stop = seeds+seedCount;
long long int *mask = RIVKeyData.masks;
long long int *mask_stop = mask+RIVKeyData.nonZeros;
while(seeds<seeds_stop){
*locations_slider =(((*seeds)^(*mask)) & 2147483647) %(RIVKeyData.RIVsize);
mask++;
locations_slider++;
seeds++;
if(!(mask<mask_stop)) mask-=RIVKeyData.nonZeros;
}
return locations;
}
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
unsigned char *word_slider = word;
while(*(*string)){
if(*(*string) == ' ') {
(*string)++;
break;
}
*word_slider = *(*string);
word_slider++;
(*string)++;
}
*word_slider = 0;
return word;
}
sparseRIV text2L2(unsigned char *text){
unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int));
unsigned char *text_slider = text;
int seedCount = 0;
while(*text_slider){
sscanAdvance(&text_slider, word);
makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000);
}
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense;
L2dense = mapI2D(locations, seedCount);
free(locations);
sparseRIV output;
consolidateD2S(&output, L2dense);
free(seeds);
return output;
}
File deleted
makefile : RIVcullGPU.cu RIVtools.c RIVLower.c RIVaccessories.c
nvcc RIVcullGPU.cu -o RIVcullG RIVtools.o RIVLower.o RIVaccessories.o -lm -O3
fileCount: 1000
../backup/0/doj_civ_fraud_enron4_0039ca34-8a9d-4327-ba3a-dffeef8297f8.txt ../backup/0/doj_civ_fraud_enron4_0030b5f1-4923-4ac5-8070-d0d459a55887.txt
0.763783
../backup/0/doj_civ_fraud_enron4_0068bc6a-ed41-47d8-afdb-3c98ca2c8a7e.txt ../backup/0/doj_civ_fraud_enron4_00682721-2d07-4a71-874e-d4f7d550fb68.txt
1.000000
../backup/0/doj_civ_fraud_enron4_0069a141-76bb-4f47-bbaa-1af8211bc783.txt ../backup/0/doj_civ_fraud_enron4_000fc924-6a76-4073-ae68-03feb4dfea7f.txt
1.000000
../backup/0/doj_civ_fraud_enron4_0071ed60-8e18-4dd8-8a14-69efa97991e6.txt ../backup/0/doj_civ_fraud_enron4_00286f4a-ffe5-4865-b6c8-22e8e6470a17.txt
0.765256
../backup/0/doj_civ_fraud_enron4_0076a278-13dd-4570-bd1b-a8667ff5a1e5.txt ../backup/0/doj_civ_fraud_enron4_001a9f7f-901d-4239-a6ce-045a75cacbad.txt
0.723166
../backup/0/doj_civ_fraud_enron4_007705c9-8a68-433b-b190-b596e0af5a44.txt ../backup/0/doj_civ_fraud_enron4_00564538-9215-4be1-9e5e-9c48c4891692.txt
0.711199
../backup/0/doj_civ_fraud_enron4_007e7a46-6d1f-4809-8bcd-8616a14a5ed2.txt ../backup/0/doj_civ_fraud_enron4_001ba443-8aa1-4901-99fa-769c921b7c97.txt
1.000000
../backup/0/doj_civ_fraud_enron4_0083815e-7e27-4e27-952a-beabf0eb2870.txt ../backup/0/doj_civ_fraud_enron4_007d7b1b-20f6-46e0-b68a-f808c72a68ee.txt
0.999466
../backup/0/doj_civ_fraud_enron4_00a6e533-5b54-4899-9247-04cd51eb7b06.txt ../backup/0/doj_civ_fraud_enron4_00537b5e-e8d2-4e7d-865c-d778fdf2388c.txt
0.742052
../backup/0/doj_civ_fraud_enron4_00ad70b2-0b8f-4f26-ad9b-294e9f80240f.txt ../backup/0/doj_civ_fraud_enron4_0071ed60-8e18-4dd8-8a14-69efa97991e6.txt
0.821186
../backup/0/doj_civ_fraud_enron4_00b2b937-21c5-452f-9540-f3d27e2caa91.txt ../backup/0/doj_civ_fraud_enron4_00aac34b-b402-4fb8-9f87-7e7ae7deff7d.txt
0.709276
../backup/0/doj_civ_fraud_enron4_00b3f2e0-cc84-4b30-8adb-15cee6374b80.txt ../backup/0/doj_civ_fraud_enron4_00713878-1dec-4366-b9ec-01effd4d4dab.txt
0.703661
../backup/0/doj_civ_fraud_enron4_00bfe59d-cb76-447b-b537-6f8beaecc4c3.txt ../backup/0/doj_civ_fraud_enron4_0049e8fe-5195-4331-9a2b-278ee42678a0.txt
0.936382
../backup/0/doj_civ_fraud_enron4_00c574de-834d-4f72-87b6-08fdbc608357.txt ../backup/0/doj_civ_fraud_enron4_000f0eea-05ef-490c-b52e-6030e5e9dc72.txt
1.000000
../backup/0/doj_civ_fraud_enron4_00c5f62a-92c7-4292-aa24-6c400328903a.txt ../backup/0/doj_civ_fraud_enron4_00ad19db-267d-495b-b45f-21090d0ef147.txt
0.962357
../backup/0/doj_civ_fraud_enron4_00ce86f8-a931-4be3-ac2d-cc69edfd781a.txt ../backup/0/doj_civ_fraud_enron4_003d83a6-923c-48f1-ac72-b74367fd120d.txt
0.714172
../backup/0/doj_civ_fraud_enron4_00ce86f8-a931-4be3-ac2d-cc69edfd781a.txt ../backup/0/doj_civ_fraud_enron4_0045c52f-e8d2-430b-a2fd-42876223b9f0.txt
0.723129
../backup/0/doj_civ_fraud_enron4_00db9dff-0ba0-4e70-93de-a751e5e34cd7.txt ../backup/0/doj_civ_fraud_enron4_00c7da83-8dc2-4418-a448-f574fc86f61a.txt
0.722726
../backup/0/doj_civ_fraud_enron4_00ddb089-dedb-4cc4-a8ba-f89de10d8e5c.txt ../backup/0/doj_civ_fraud_enron4_00a6e533-5b54-4899-9247-04cd51eb7b06.txt
0.718547
../backup/0/doj_civ_fraud_enron4_00ddb089-dedb-4cc4-a8ba-f89de10d8e5c.txt ../backup/0/doj_civ_fraud_enron4_00c113f9-ac53-4492-a743-1b72236407aa.txt
0.770782
../backup/0/doj_civ_fraud_enron4_00e27fb1-0685-4e33-b4cc-3cad999b45e5.txt ../backup/0/doj_civ_fraud_enron4_00d76bea-9be0-4a6b-8b1d-fc8677ea8827.txt
1.000000
../backup/0/doj_civ_fraud_enron4_00f080b9-460c-4eb1-a47a-0372acb6a555.txt ../backup/0/doj_civ_fraud_enron4_002290b5-791a-4f5c-9527-c929869d64c4.txt
0.722857
../backup/0/doj_civ_fraud_enron4_00ff57e6-64fe-40d1-b97f-61c1f2917be3.txt ../backup/0/doj_civ_fraud_enron4_00ddb089-dedb-4cc4-a8ba-f89de10d8e5c.txt
0.759524
nsquared time:0.008044
23 <total time:0.285595
File deleted
File deleted
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment