Commit 7c37cc43 by Ethan

updated and commented

parent 41cdd603
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* RIVSIZE sets the dimensionality of the RIVs this library works with.
* It defaults to 25000; define RIVSIZE before this point (for example before
* including this file) to choose a different size.
* RIVinit() copies the value into RIVKey.RIVsize.
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* NONZEROS sets the number of non-zero values generated for any level one
* (barcode) RIV. The default of 2 is simple and lightweight to begin with.
* RIVinit() expects an even number: an odd value is reported and raised by one.
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
/* CACHESIZE sets the number of denseRIV slots that RIVinit() allocates for
* the cache (default 20). Per the original author, a larger cache means more
* memory consumption but is significantly faster in aggregation and reading
* applications, and it does not affect programs that do not use
* lexPull/lexPush.
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
/* sparseRIV: the RIV representation intended for vectors that are mostly 0.
 * The original author recommends it as the default form unless long-term
 * RIV aggregation is being done.
 * It stores only the non-zero entries as two parallel arrays: for every
 * index i below count, values[i] is the value held at vector position
 * locations[i].
 */
typedef struct {
    char name[100];          /* label for this RIV (a word or a file path) */
    int *values;             /* value of each stored entry */
    int *locations;          /* vector index of each stored entry */
    size_t count;            /* number of location/value pairs */
    unsigned int frequency;  /* presumably how many words were summed in - confirm with fileToL2 */
    float magnitude;         /* vector length; not filled in by consolidateD2S */
    int boolean;             /* general-purpose flag; meaning is set by the caller */
} sparseRIV;
/* denseRIV: the RIV representation intended for overwhelmingly non-zero
 * vectors. That case is rare; the form is mainly used for vector math,
 * because comparisons and arithmetic are ideally performed between one
 * sparse and one dense operand (hetero-arithmetic).
 */
typedef struct {
    char name[100];   /* label for this RIV; fLexPush uses it as the lexicon file name */
    int *values;      /* the full vector; fLexPush writes RIVKey.RIVsize ints from here */
    int *frequency;   /* pointer to the frequency counter - NOTE(review): appears to
                       * point into the same allocation as values; confirm in lexPull */
    float magnitude;  /* vector length, stored alongside the values by fLexPush */
} denseRIV;
/* RIVKey holds the library-wide settings and scratch memory. It should not
 * be changed partway through a run.
 * The first library call in a program should always be:
 *     RIVinit();
 * which fills in these fields, checks for incompatible choices, and opens
 * the memory blocks the library uses in the background.
 */
static struct RIVData {
    size_t RIVsize;      /* dimensionality of every RIV (from RIVSIZE) */
    int nonZeros;        /* non-zero entries generated per word (from NONZEROS) */
    int *h_tempBlock;    /* shared scratch buffer, allocated by RIVinit */
    int tempSize;        /* capacity of h_tempBlock, counted in ints */
    int thing;           /* general-purpose counter, zeroed by RIVinit */
    denseRIV *RIVCache;  /* cache of cacheSize dense RIVs */
    int cacheSize;       /* number of cache slots (from CACHESIZE) */
} RIVKey;
/* RIVinit should be the first function called in any usage of this library.
 * It sets the global variables that practically all functions reference,
 * checks that the base parameters are valid, and allocates memory for
 * the functions to use, so that we can move fast with rare allocations.
 * #TODO add signal redefinitions so that cache is saved even on crash
 */
void RIVinit();
/* RIVCleanup should always be called to close a RIV program. It frees the
 * blocks allocated by RIVinit, and dumps the cached data to the appropriate
 * lexicon files.
 */
void RIVCleanup();
/* consolidateD2S takes a dense value-set input (RIVKey.RIVsize ints) and
 * returns a sparseRIV with all 0s removed. It does not automatically carry
 * metadata, which must be assigned to the result after the fact. Often the
 * dense inputs are only temporary and have no metadata to carry.
 * The returned locations and values live in ONE allocation: release it with
 * free(result.locations) only.
 */
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to dense values in destination (which must
 * hold RIVKey.RIVsize ints), filling array positions from the
 * location/value pairs. Returns destination.
 */
int* mapS2D(int* destination, sparseRIV input); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly while processing a file, once
 * per word, to produce a series of locations from the words of the file.
 * Each call writes RIVKey.nonZeros locations starting at seeds[seedCount].
 * This produces an "implicit" RIV which can be used with the mapI2D function
 * to create a dense RIV.
 * (The name previously read "makesparseLocations" here, which did not match
 * the definition or its callers.)
 */
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation. It is called by lexPush, which is
 * what users should actually use: lexPush, unlike fLexPush, has cache logic
 * under the hood for speed and hard-drive optimization.
 * Returns 0 on success, non-zero on failure.
 */
int fLexPush(denseRIV RIVout);
/* wordtoSeed folds the bytes of a NUL-terminated word into an int that is
 * used to seed rand() for that word.
 */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV", that is, an array of index values arranged
 * in chronological order of generation (as per makeSparseLocations), into a
 * newly allocated dense array. Values are assigned according to ordering:
 * +1 for the first index of each pair, -1 for the second.
 * The caller frees the returned array.
 */
int* mapI2D(int *locations, size_t seedCount);
/* begin definitions */
/* mapS2D: expand a sparseRIV into a dense vector.
 * destination - buffer of at least RIVKey.RIVsize ints; it is overwritten
 * input       - sparse vector whose location/value pairs are written out
 * returns destination
 */
int* mapS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    /* start from an all-zero vector */
    memset(destination, 0, RIVKey.RIVsize*sizeof(int));
    /* drop each stored value into the position its paired location names */
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] = input.values[pair];
    }
    return destination;
}
/* mapI2D: build a dense vector from an "implicit RIV" (a flat list of indices).
 * locations  - indices in generation order, taken as (+1, -1) pairs
 * valueCount - number of indices in locations
 * returns a newly allocated array of RIVKey.RIVsize ints that the caller
 * must free, or NULL if the allocation fails.
 * An odd valueCount is tolerated: the final unpaired index receives its +1
 * and nothing is read past the end of locations.
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = (int*)calloc(RIVKey.RIVsize,sizeof(int));
    if(!destination){
        fprintf(stderr, "mapI2D: memory allocation failed\n");
        return NULL;
    }
    int *locations_slider = locations;
    int *locations_stop = locations_slider+valueCount;
    /* apply values +1 or -1 at an index based on locations */
    while(locations_slider<locations_stop){
        destination[*locations_slider] += 1;
        locations_slider++;
        /* stop here if the list held an odd number of indices */
        if(locations_slider == locations_stop){
            break;
        }
        destination[*locations_slider] -= 1;
        locations_slider++;
    }
    return destination;
}
/* consolidateD2S: compress a dense vector into a sparseRIV.
 * denseInput - RIVKey.RIVsize ints
 * returns a sparseRIV holding every non-zero entry of denseInput.
 * locations and values share ONE allocation (values == locations + count),
 * so the caller releases the result with free(output.locations) only.
 * All metadata fields (name, frequency, magnitude, boolean) are zeroed; the
 * caller assigns them afterwards.
 * On allocation failure a message is printed and the result has count 0 and
 * NULL locations/values. On success locations is never NULL, even for an
 * all-zero input.
 * Uses the first 2*RIVKey.RIVsize ints of RIVKey.h_tempBlock as scratch.
 */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output = {0};
    /* key/value pairs are first gathered in a worst-case sized scratch area */
    int* locations = RIVKey.h_tempBlock;
    int* values = RIVKey.h_tempBlock+RIVKey.RIVsize;
    size_t found = 0;
    for(size_t i=0; i<RIVKey.RIVsize; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[found] = (int)i;
            values[found] = denseInput[i];
            found++;
        }
    }
    /* one slot holds both arrays; ask for at least one pair so that a NULL
     * result always means failure, never "malloc(0) returned NULL" */
    size_t slots = found ? found : 1;
    output.locations = (int*) malloc(slots*2*sizeof(int));
    if(!output.locations){
        printf("memory allocation failed"); //*TODO enable fail point knowledge
        return output;
    }
    /* copy locations into the front of the slot, values directly after */
    memcpy(output.locations, locations, found*sizeof(int));
    output.values = output.locations + found;
    memcpy(output.values, values, found*sizeof(int));
    output.count = found;
    return output;
}
/* RIVinit: set up RIVKey from the RIVSIZE / NONZEROS / CACHESIZE macros and
 * allocate the scratch block and the dense-RIV cache.
 * An odd NONZEROS is reported and rounded up to the next even number.
 * A RIVSIZE of 0, or a failed allocation, is reported on stderr and ends the
 * program with EXIT_FAILURE, because no other library function can work
 * without these buffers.
 */
void RIVinit(){
    RIVKey.RIVsize = RIVSIZE; //#TODO decide about macros vs global variables
    RIVKey.nonZeros = NONZEROS;
    if(RIVKey.nonZeros%2){
        printf("your NONZEROS value must be an even number");
        RIVKey.nonZeros++;
        printf(", changed to %d", RIVKey.nonZeros);
    }
    RIVKey.thing = 0;
    RIVKey.cacheSize = CACHESIZE;
    RIVKey.tempSize = 0;
    RIVKey.RIVCache = NULL;
    if(RIVKey.RIVsize == 0){
        /* locations are computed modulo RIVsize, so 0 can never work */
        fprintf(stderr, "RIVinit: RIVSIZE must be greater than 0\n");
        exit(EXIT_FAILURE);
    }
    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by fileToL2 functions */
    RIVKey.h_tempBlock = (int*)malloc(3*RIVKey.RIVsize*sizeof(int));
    if(!RIVKey.h_tempBlock){
        fprintf(stderr, "RIVinit: memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    RIVKey.tempSize = (int)(3*RIVKey.RIVsize);
    /* open a slot for a cache of dense RIVs, optimized for frequent accesses;
     * with a cache size of 0 there is nothing to allocate */
    if(RIVKey.cacheSize > 0){
        RIVKey.RIVCache = (denseRIV*)calloc(RIVKey.cacheSize,sizeof(denseRIV));
        if(!RIVKey.RIVCache){
            fprintf(stderr, "RIVinit: memory allocation failed\n");
            free(RIVKey.h_tempBlock);
            RIVKey.h_tempBlock = NULL;
            RIVKey.tempSize = 0;
            exit(EXIT_FAILURE);
        }
    }
}
/* RIVCleanup: write every occupied cache slot to its lexicon file, then
 * release the cache and the scratch block allocated by RIVinit.
 * Slots that were never filled (empty name) are skipped: they have no data,
 * and pushing them would only try to open "lexicon/" itself.
 * The RIVKey pointers are cleared afterwards.
 */
void RIVCleanup(){
    if(RIVKey.RIVCache){
        for(int i=0; i<RIVKey.cacheSize; i++){
            if(!RIVKey.RIVCache[i].name[0]){
                continue;
            }
            /* fLexPush releases the slot's values once its file is written */
            fLexPush(RIVKey.RIVCache[i]);
        }
    }
    /* free(NULL) is a no-op, so no CACHESIZE guard is needed */
    free(RIVKey.RIVCache);
    RIVKey.RIVCache = NULL;
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
    RIVKey.tempSize = 0;
}
/* wordtoSeed: fold the bytes of a NUL-terminated word into a seed for rand().
 * Byte i is shifted left by 5*i bits and added to the running total; the
 * 5-bit stride *should* make seeds unique to (short) words.
 * The arithmetic is done in unsigned int with the shift count wrapped to
 * 0..31, so long words no longer shift by 32 bits or more and the sum no
 * longer overflows a signed int (both undefined in C).
 * NOTE(review): wrapping the count mirrors what 32-bit x86 shift
 * instructions do, so seeds are expected to match earlier builds on that
 * platform - confirm against an existing lexicon. Assumes 32-bit int.
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        seed += (unsigned int)(*word) << (shift & 31u);
        word++;
        shift += 5;
    }
    return (int)seed;
}
/* makeSparseLocations: append the pseudo-random locations for one word.
 * word      - NUL-terminated word; it seeds rand() through wordtoSeed
 * locations - buffer receiving the locations
 * count     - number of locations already stored; writing starts there
 * Writes RIVKey.nonZeros entries, each in the range 0..RIVKey.RIVsize-1.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *slot = locations + count;
    srand(wordtoSeed(word));
    /* two locations per pass; nonZeros is guaranteed to be even */
    for(int written = 0; written < RIVKey.nonZeros; written += 2){
        slot[0] = rand()%RIVKey.RIVsize;
        slot[1] = rand()%RIVKey.RIVsize;
        slot += 2;
    }
}
/* fLexPush: write one denseRIV to the file "lexicon/<name>".
 * File layout, in order: the frequency (one int), the magnitude (one
 * float), then RIVKey.RIVsize ints of values.
 * Ownership: once the file has been opened, RIVout.values is freed before
 * returning, whether or not the writes succeeded. If the file cannot be
 * opened, or the RIV has no data, nothing is freed.
 * returns 0 on success, 1 on any failure (no data, open, write or close).
 */
int fLexPush(denseRIV RIVout){
    char pathString[500] = {0};
    if(!RIVout.frequency || !RIVout.values){
        fprintf(stderr, "lexicon push skipped, no data for word: %s\n", RIVout.name);
        return 1;
    }
    /* word data will be placed in a (new?) file under the lexicon directory
     * and named after the word itself */
    snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    int failed = 0;
    if(fwrite(RIVout.frequency, sizeof *RIVout.frequency, 1, lexWord) != 1){
        failed = 1;
    }
    if(fwrite(&RIVout.magnitude, sizeof RIVout.magnitude, 1, lexWord) != 1){
        failed = 1;
    }
    if(fwrite(RIVout.values, sizeof *RIVout.values, RIVKey.RIVsize, lexWord) != RIVKey.RIVsize){
        failed = 1;
    }
    /* fclose flushes, so a full disk can surface only here */
    if(fclose(lexWord) != 0){
        failed = 1;
    }
    if(failed){
        fprintf(stderr, "lexicon write has failed for word: %s\n", pathString);
    }
    free(RIVout.values);
    return failed;
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include <time.h>
/* library parameters for this program, set before the library is included;
 * presumably this is what keeps the library's #ifndef defaults from
 * applying - confirm that RIVtoolsCPUlinux.h pulls in those guards */
#define RIVSIZE 5000
#define CACHESIZE 0
#include "RIVtoolsCPUlinux.h"
/* minimum cosine at which printname reports a pair of files */
#define THRESHOLD .80f
/* directoryToL2s walks rootString (which must end in '/') recursively and
 * appends one sparseRIV per file to *fileRIVs, counting them in *fileCount */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* printname is the callback handed to cosineCompare: it prints the two names
 * and the cosine when the cosine reaches THRESHOLD */
int printname(float cosine, sparseRIV base, sparseRIV multiplier);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVinit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <1){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
getMagnitudes(fileRIVs, fileCount);
clock_t beginnsquared = clock();
printf("got past magnitudes");
for(int i=0; i<fileCount; i++){
if(fileRIVs[i].boolean){
cosineCompare(fileRIVs[i], fileRIVs+i+1, fileCount-(i+1), printname);
}
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
/* directoryToL2s: recursively turn every file under rootString into a
 * sparseRIV.
 * rootString - directory path ending in '/'
 * fileRIVs   - growing array of results; *fileRIVs is reallocated as needed
 * fileCount  - number of entries in *fileRIVs, updated in place
 * Entries whose name starts with '.' are skipped. Sub-directories are
 * descended into and are NOT themselves opened as files. Each RIV is named
 * after its path, truncated to fit the name field. If a file cannot be
 * opened, or the array cannot grow, the rest of this directory is abandoned.
 * NOTE(review): relies on d_type; file systems that report DT_UNKNOWN will
 * have their directories treated as files.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files = 0;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if(length < 0 || (size_t)length >= sizeof pathString){
            printf("path too long, skipping: %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* grow through a temporary so the array survives a failed realloc */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        grown[(*fileCount)] = fileToL2Clean(input);
        /* name holds 100 bytes while a path may be far longer: copy bounded */
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
/* printname: cosineCompare callback that reports sufficiently similar pairs.
 * cosine     - cosine similarity of the two RIVs
 * base       - the RIV being compared against
 * multiplier - the RIV it was compared with
 * When cosine is at least THRESHOLD, prints both names and the cosine and
 * increments RIVKey.thing. Always returns 0.
 */
int printname(float cosine, sparseRIV base, sparseRIV multiplier){
    /* written as a negated >= so that a NaN cosine is also rejected */
    if(!(cosine >= THRESHOLD)){
        return 0;
    }
    printf("%s\t%s\n%f\n", base.name, multiplier.name, cosine);
    /* NOTE(review): multiplier is passed by value, so this clears the flag
     * on a local copy only; the caller's RIV stays flagged */
    multiplier.boolean = 0;
    RIVKey.thing++;
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* number of dense RIVs to keep cached in this program; defined before the
 * library include below, presumably so that it replaces the library's
 * default - confirm that RIVtoolsCPUlinux.h pulls in the #ifndef guard */
#define CACHESIZE 100
#include "RIVtoolsCPUlinux.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
/* fileGrind builds the aggregate RIV of one open text file and adds it to
 * the lexicon entry of every distinct clean word found in that file */
void fileGrind(FILE* textFile);
/* addS2Ds adds the sparse RIV additive into each of the first RIVCount
 * dense RIVs of denseSet */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
/* checkDupe returns 1 if word is the name of one of the first wordCount
 * entries of RIVSet, 0 otherwise */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
/* directoryGrind runs fileGrind on every file under rootString, descending
 * into sub-directories; rootString must end in '/' */
void directoryGrind(char *rootString);
/* main: grind every file under the directory named in argv[1] into the
 * lexicon, print the elapsed time, list the cache contents, and flush the
 * cache to disk.
 * returns 0 on success, 1 when the directory argument is missing or too long.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("directory path is too long\n");
        return 1;
    }
    /* RIVinit is the library's initialiser (it replaces the former
     * setKeyData) and pairs with the RIVCleanup call below */
    RIVinit();
    directoryGrind(pathString);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    for( int i=0; i<RIVKey.cacheSize; i++){
        /* slots that were never filled have no frequency to print */
        if(!RIVKey.RIVCache[i].frequency){
            continue;
        }
        printf("%s, %d", RIVKey.RIVCache[i].name, *(RIVKey.RIVCache[i].frequency));
        printf("\n");
    }
    RIVCleanup();
    return 0;
}
/* addS2Ds: add one sparse RIV into every dense RIV of a set.
 * denseSet - array of dense RIVs to update in place
 * additive - sparse RIV whose values are added at their locations
 * RIVCount - number of entries in denseSet
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    /* outer loop over the sparse pairs, inner loop over the dense targets */
    for(size_t pair = 0; pair < additive.count; pair++){
        for(int target = 0; target < RIVCount; target++){
            denseSet[target].values[additive.locations[pair]] += additive.values[pair];
        }
    }
}
/* checkDupe: report whether a word already has an entry in a RIV set.
 * RIVSet    - array of dense RIVs
 * word      - NUL-terminated word to look for
 * wordCount - number of entries in RIVSet to examine
 * returns 1 if some entry's name equals word, 0 otherwise
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int seen = 0; seen < wordCount; seen++){
        if(strcmp(word, RIVSet[seen].name) == 0){
            return 1;
        }
    }
    return 0;
}
/* directoryGrind: run fileGrind on every file under rootString.
 * rootString - directory path ending in '/'
 * The "." and ".." entries are skipped and sub-directories are descended
 * into. Every entry's path is printed before an attempt is made to open it
 * for update ("r+"); entries that cannot be opened are skipped silently.
 * Entries whose full path would not fit in the path buffer are skipped.
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files = 0;
    while((files=readdir(directory))){
        /* skip through the loop condition so that running out of entries
         * here ends the loop instead of dereferencing NULL */
        if(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){
            continue;
        }
        int length;
        if(files->d_type == DT_DIR){
            length = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
            if(length >= 0 && (size_t)length < sizeof pathString){
                directoryGrind(pathString);
            }
        }
        /* as before, a directory falls through to here too: its path is
         * printed and the open below is still attempted */
        length = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
        if(length < 0 || (size_t)length >= sizeof pathString){
            continue;
        }
        printf("%s\n", pathString);
        FILE *input = fopen(pathString, "r+");
        if(input){
            fileGrind(input);
            fclose(input);
        }
    }
    closedir(directory);
}
/* fileGrind: fold one text file into the lexicon.
 * textFile - open, seekable stream positioned at the start of the file
 * Steps: build the file's aggregate sparseRIV with fileToL2Clean, rewind,
 * pull the lexicon entry of every distinct clean word (bumping its
 * frequency), add the aggregate into each of them, and push them back.
 * The per-file array is sized from aggregateRIV.frequency and is never
 * written past that size.
 */
void fileGrind(FILE* textFile){
    sparseRIV aggregateRIV = fileToL2Clean(textFile);
    fseek(textFile, 0, SEEK_SET);
    int wordCount = 0;
    size_t capacity = aggregateRIV.frequency;
    /* ask for at least one slot so that NULL always means failure */
    denseRIV *RIVArray = (denseRIV*)malloc((capacity ? capacity : 1)*sizeof(denseRIV));
    if(!RIVArray){
        printf("memory allocation failed");
        free(aggregateRIV.locations);
        return;
    }
    char word[200];
    /* stop on EOF or read error instead of looping on a stale word */
    while(fscanf(textFile, "%99s", word) == 1){
        if(feof(textFile)) break;
        if(!(*word))continue;
        if(!isWordClean((char*)word)){
            continue;
        }
        if(!checkDupe(RIVArray, word, wordCount)){
            /* never write past the slots allocated above */
            if((size_t)wordCount >= capacity) break;
            RIVArray[wordCount] = lexPull(word);
            if(!*((RIVArray[wordCount].name))) break;
            (*RIVArray[wordCount].frequency)++;
            wordCount++;
        }
    }
    addS2Ds(RIVArray, aggregateRIV, wordCount);
    denseRIV* RIVArray_slider = RIVArray;
    denseRIV* RIVArray_stop = RIVArray+wordCount;
    while(RIVArray_slider<RIVArray_stop){
        lexPush(*RIVArray_slider);
        RIVArray_slider++;
    }
    free(RIVArray);
    /* consolidateD2S places values inside the locations allocation, so this
     * single free releases both; freeing values as well would be invalid */
    free(aggregateRIV.locations);
}
...@@ -2,138 +2,126 @@ ...@@ -2,138 +2,126 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "RIVLower.h"
#define SEEDMASK 25214903917
struct RIVData{ void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold);
int RIVsize; /*lexPush writes a denseRIV to a file of the same name, under the directory "lexicon"
int nonZeros; * it is up to the programmer to ensure that the name of the RIV is a valid filename
long long int *masks; * although it will of course attempt to create the file if it does not exist
int *h_tempBlock; */
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
int thing;
}RIVKey;
typedef struct{
char name[100];
int *values;
int *locations;
int count;
int frequency;
float magnitude;
int boolean;
}sparseRIV;
typedef struct{
char name[100];
int *values;
int frequency;
float magnitude;
}denseRIV;
int lexPush(denseRIV RIVout); int lexPush(denseRIV RIVout);
denseRIV lexPull(int *valuesOut, char* word); /* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through
*/
int cacheHash (char* word);
int isWordClean(char* word); int isWordClean(char* word);
int isLetter(char c); int isLetter(char c);
sparseRIV FileToL2(FILE *data); /* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
sparseRIV FileToL2Clean(FILE *data); * and returns a sparse RIV which is the vector sum of the base RIVs of each
sparseRIV consolidateD2S(int *denseInput); * word contained
void setKeyData(int RIVsize, int nonZeros, int blockSize); */
int* mapS2D(int * destination, sparseRIV input); sparseRIV fileToL2(FILE *input);
int* makeSparseLocations(int *seeds, int seedCount); /* fileToL2Clean operates the same as fileToL2 butkeeps only words
void makeSeeds(unsigned char* word, int **seeds, int *seedCount); * containing lowercase letters and the '_' symbol
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold); * this is important if you will be lexPush-ing those words later
void getMagnitudes(sparseRIV *inputs, int RIVCount); */
int *mapI2D(int *locations, int seedCount); sparseRIV fileToL2Clean(FILE *data);
sparseRIV text2L2(unsigned char *text); void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier));
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word); void getMagnitudes(sparseRIV *inputs, size_t RIVCount);
sparseRIV text2L2(unsigned char *text);//unused
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused
sparseRIV FileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned int blockSize;
unsigned char word[2000] = {0};
int *seeds = RIVKey.h_tempBlock;
int seedCount = 0; unsigned char word[100] = {0};
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
while(fscanf(data, "%s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
}
blockSize = locationCount+RIVKey.nonZeros;
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=RIVKey.nonZeros;
} }
makeSeeds(word, &seeds, &seedCount); makeSparseLocations(word, locations, locationCount);
locationCount++;
} }
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense; int *L2dense;
L2dense = mapI2D(locations, seedCount); L2dense = mapI2D(locations, locationCount);
sparseRIV output = consolidateD2S(L2dense); sparseRIV output = consolidateD2S(L2dense);
free(L2dense); free(L2dense);
output.frequency = seedCount/RIVKey.nonZeros; output.frequency = locationCount/RIVKey.nonZeros;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
sparseRIV FileToL2Clean(FILE *data){
sparseRIV fileToL2Clean(FILE *data){
unsigned char word[100] = {0}; unsigned char word[100] = {0};
int *seeds = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int blockSize;
int seedCount = 0; int locationCount = 0;
while(fscanf(data, "%100s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
if(!isWordClean((char*)word)) continue; blockSize = locationCount+RIVKey.nonZeros;
makeSeeds(word, &seeds, &seedCount); if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=RIVKey.nonZeros;
}
makeSparseLocations(word, locations, locationCount);
locationCount+= RIVKey.nonZeros;
} }
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense; int *L2dense;
L2dense = mapI2D(locations, seedCount); L2dense = mapI2D(locations, locationCount);
sparseRIV output = consolidateD2S(L2dense); sparseRIV output = consolidateD2S(L2dense);
free(L2dense); free(L2dense);
output.frequency = seedCount/RIVKey.nonZeros; output.frequency = locationCount/RIVKey.nonZeros;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold){
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
int *baseDenseRIV = RIVKey.h_tempBlock; int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV); mapS2D(baseDenseRIV, baseRIV);
float cosSim; float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount; sparseRIV *multipliersStop = multipliers+multiplierCount;
float minsize = baseRIV.magnitude * .75;
float maxsize = baseRIV.magnitude * 1.25;
while(multipliers<multipliersStop){ while(multipliers<multipliersStop){
if(((*multipliers).boolean)/* && (((*multipliers).magnitude < maxsize) && ((*multipliers).magnitude > minsize))*/){ if((*multipliers).boolean){
int dot = 0; int dot = 0;
int *values = (*multipliers).values; int *values = (*multipliers).values;
int *locations = (*multipliers).locations; int *locations = (*multipliers).locations;
...@@ -146,22 +134,56 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCoun ...@@ -146,22 +134,56 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCoun
values++; values++;
} }
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude)); cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
//if(cosSim>=threshold){ if(cosSim>=threshold){
printf("#######%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim); printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim);
(*multipliers).boolean = 0; (*multipliers).boolean = 0;
RIVKey.thing ++; RIVKey.thing ++;
scanf("%d", &RIVKey.thing); scanf("%d", &RIVKey.thing);
//} }
} }
multipliers++; multipliers++;
} }
}
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)){
int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV);
float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount;
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
while(multipliers<multipliersStop){
if(((*multipliers).boolean)
&& (((*multipliers).magnitude < maxsize)
&& ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++;
values++;
}
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
action(cosSim, baseRIV, (*multipliers));
}
multipliers++;
}
} }
void getMagnitudes(sparseRIV *inputs, int RIVCount){ void getMagnitudes(sparseRIV *inputs, size_t RIVCount){
for(int i=0; i<RIVCount; i++){ for(int i=0; i<RIVCount; i++){
int temp = 0; unsigned int temp = 0;
int *values = inputs[i].values; int *values = inputs[i].values;
int *values_stop = values+inputs[i].count; int *values_stop = values+inputs[i].count;
while(values<values_stop){ while(values<values_stop){
...@@ -171,131 +193,9 @@ void getMagnitudes(sparseRIV *inputs, int RIVCount){ ...@@ -171,131 +193,9 @@ void getMagnitudes(sparseRIV *inputs, int RIVCount){
} }
float magnitude = sqrt(temp); float magnitude = sqrt(temp);
inputs[i].magnitude = magnitude; inputs[i].magnitude = magnitude;
//printf("magnitude = %f, \n", magnitude);
}
}
int* mapS2D(int* destination, sparseRIV input){
memset(destination, 0, RIVKey.RIVsize*sizeof(int));
int *locations_slider = input.locations;
int *values_slider = input.values;
int *locations_stop = locations_slider+input.count;
while(locations_slider<locations_stop){
destination[*locations_slider] = *values_slider;
locations_slider++;
values_slider++;
}
return destination;
}
int* mapI2D(int *locations, int valueCount){
int *destination = (int*)calloc(RIVKey.RIVsize,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
int value = 1;
while(locations_slider<locations_stop){
destination[*locations_slider] +=value;
locations_slider++;
value = (value == 1)? -1: 1;
}
return destination;
}
sparseRIV consolidateD2S(int *denseInput){
sparseRIV output;
output.count = 0;
int* locations = RIVKey.h_tempBlock;
int* values = RIVKey.h_tempBlock+RIVKey.RIVsize;
int* locations_slider = locations;
int* values_slider = values;
for(int i=0; i<RIVKey.RIVsize; i++){
if(denseInput[i]){
*(locations_slider++) = i;
*(values_slider++) = denseInput[i];
output.count++;
}
} }
output.locations = (int*) malloc(output.count*sizeof(int));
memcpy(output.locations, locations, output.count*sizeof(int));
output.values = (int*) malloc(output.count*sizeof(int));
memcpy(output.values, values, output.count*sizeof(int));
return output;
} }
void setKeyData(int RIVsize, int nonZeros, int blockSize){
RIVKey.RIVsize = RIVsize;
if(nonZeros%2){
printf("your nonZeros must be an even number");
nonZeros++;
printf(", changed to %d", nonZeros);
}
RIVKey.nonZeros = nonZeros;
RIVKey.masks = (long long int*)malloc(nonZeros*sizeof(long long int));
for(int i = 0; i<nonZeros; i++){
RIVKey.masks[i] = SEEDMASK>>(5*i);
}
RIVKey.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
RIVKey.h_stagingBlock = (int*)malloc(blockSize*sizeof(int));
RIVKey.h_staging_slider = RIVKey.h_stagingBlock;
RIVKey.thing = 0;
}
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
int i=0;
int seedbase = 0;
while(*word){
seedbase += (*(word))<<(i*5);
word++;
i++;
}
int *seedTrack = (*seeds)+*seedCount;
for(i =0 ; i<RIVKey.nonZeros; i++){
*seedTrack = (seedbase>>i)+(3*i);
seedTrack++;
}
*seedCount+=RIVKey.nonZeros;
return;
}
int* makeSparseLocations(int* seeds, int seedCount){
int *locations = RIVKey.h_tempBlock;
int *locations_slider = locations;
int *seeds_stop = seeds+seedCount;
long long int *mask = RIVKey.masks;
long long int *mask_stop = mask+RIVKey.nonZeros;
while(seeds<seeds_stop){
*locations_slider =(((*seeds)^(*mask)) & 2147483647) %(RIVKey.RIVsize);
mask++;
locations_slider++;
seeds++;
if(!(mask<mask_stop)) mask-=RIVKey.nonZeros;
}
return locations;
}
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
unsigned char *word_slider = word; unsigned char *word_slider = word;
...@@ -312,25 +212,26 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){ ...@@ -312,25 +212,26 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
return word; return word;
} }
sparseRIV text2L2(unsigned char *text){ sparseRIV text2L2(unsigned char *text){
unsigned char word[2000] = {0}; unsigned char word[2000] = {0};
int *seeds = RIVKey.h_tempBlock; int* locations = RIVKey.h_tempBlock;
unsigned char *text_slider = text; unsigned char *text_slider = text;
int seedCount = 0; int locationCount = 0;
while(*text_slider){ while(*text_slider){
sscanAdvance(&text_slider, word); sscanAdvance(&text_slider, word);
if(word[0]){ if(word[0]){
makeSeeds(word, &seeds, &seedCount); makeSparseLocations(word, locations, locationCount);
locationCount+=RIVKey.nonZeros;
} }
} }
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense; int *L2dense;
L2dense = mapI2D(locations, seedCount); L2dense = mapI2D(locations, locationCount);
sparseRIV output = consolidateD2S(L2dense); sparseRIV output = consolidateD2S(L2dense);
free(L2dense); free(L2dense);
...@@ -358,61 +259,69 @@ int isWordClean(char* word){ ...@@ -358,61 +259,69 @@ int isWordClean(char* word){
return 1; return 1;
} }
denseRIV lexPull(int *valuesOut, char* word){ denseRIV lexPull(char* word){
#if CACHESIZE > 0
int hash = cacheHash(word);
if(!strcmp(word, RIVKey.RIVCache[hash].name)){
return RIVKey.RIVCache[hash];
}
#endif /* CACHESIZE > 0 */
denseRIV output; denseRIV output;
output.values = valuesOut;
output.values = (int*)calloc(RIVKey.RIVsize+1, sizeof(int));
output.frequency = output.values+RIVKey.RIVsize;
char pathString[200]; char pathString[200];
FILE *lexWord;
sprintf(pathString, "lexicon/%s", word); sprintf(pathString, "lexicon/%s", word);
lexWord = fopen(pathString, "r+"); FILE *lexWord = fopen(pathString, "rb");
strcpy(output.name, word); strcpy(output.name, word);
if(lexWord){ if(lexWord){
fscanf(lexWord, "%d,%f", &output.frequency, &output.magnitude); fread(output.frequency, 1, sizeof(int), lexWord);
int* values_slider = valuesOut; fread(&(output.magnitude), 1, sizeof(int), lexWord);
int* values_stop = valuesOut+RIVKey.RIVsize; fread(output.values, RIVKey.RIVsize, sizeof(int), lexWord);
while(values_slider<values_stop){
fscanf(lexWord, ",%d", values_slider);
values_slider++;
}
fclose(lexWord); fclose(lexWord);
}else{ }else{
output.frequency = 0; *(output.frequency) = 0;
output.magnitude = 0; output.magnitude = 0;
memset(valuesOut, 0, RIVKey.RIVsize*sizeof(int));
} }
return output; return output;
} }
int lexPush(denseRIV RIVout){ int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
char pathString[1000] = {0}; fLexPush(RIVout);
strcpy(pathString, "lexicon"); return 0;
strcat(pathString, "/"); #else /*CACHESIZE != 0 */
strcat(pathString, RIVout.name); srand(wordtoSeed((unsigned char*)RIVout.name));
//printf("%s\n", pathString); int hash = rand()%RIVKey.cacheSize;
FILE *lexWord = fopen(pathString, "w+"); if(!strcmp(RIVout.name, RIVKey.RIVCache[hash].name)) return 0;
if(!lexWord){ if(!RIVKey.RIVCache[hash].frequency){
lexWord = fopen(pathString, "w+"); RIVKey.RIVCache[hash] = RIVout;
if(!lexWord){ return 0;
printf("fucked it up big time bro, %s\n", pathString); }else if(*RIVout.frequency>*RIVKey.RIVCache[hash].frequency){
printf("%s\n", pathString); int diag = fLexPush(RIVKey.RIVCache[hash]);
return 1; RIVKey.RIVCache[hash] = RIVout;
} return diag;
} }else{
fLexPush(RIVout);
//printf( "%f",RIVout.magnitude);
fprintf(lexWord, "%d,%f", RIVout.frequency, RIVout.magnitude);
int *values_slider = RIVout.values;
int *values_stop = RIVout.values+RIVKey.RIVsize;
while(values_slider<values_stop){
fprintf(lexWord, ",%d", *(values_slider));
values_slider++;
} }
fclose(lexWord);
return 0; return 0;
#endif /*CACHESIZE == 0 */
}
int cacheHash (char* word){
int i=0;
int seed = 0;
while(*word){
seed += (*(word))<<(i*5);
word++;
i++;
}
srand(seed);
return rand()%RIVKey.cacheSize;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment