Commit 31b0c44b by Ethan

Added more comments

parent 7c37cc43
Showing with 64 additions and 36 deletions
......@@ -32,19 +32,26 @@ sparseRIV fileToL2(FILE *input);
* this is important if you will be lexPush-ing those words later
*/
sparseRIV fileToL2Clean(FILE *data);
/* cosineCompare determines the cosine "similarity" between baseRIV and the
 * RIVs in multipliers (multiplierCount entries). For every pair it does not
 * skip, the action callback is called with the cosine and the two RIVs. */
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier));
/* getMagnitudes takes an array of RIVCount RIVs; the magnitudes will be
 * used later in cosine comparison */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount);
sparseRIV text2L2(unsigned char *text);//unused
/* NOTE(review): sscanAdvance is declared twice below; the repeat is legal C
 * but looks like old/new residue of a single edited line - confirm and keep
 * only one declaration */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused except in text2l2
sparseRIV fileToL2(FILE *data){
unsigned int blockSize;
unsigned char word[100] = {0};
/* locations (the implicit RIV) are stored temporarily in the temp block,
* and moved to their permanent home during consolidation */
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
while(fscanf(data, "%99s", word)){
if(feof(data)){
......@@ -52,23 +59,29 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unuse
}
if(!(*word)){
break;
}
}
blockSize = locationCount+RIVKey.nonZeros;
/* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=RIVKey.nonZeros;
}
/* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount);
locationCount++;
locationCount+= RIVKey.nonZeros;
}
int *L2dense;
/* in the next two steps, an implicit RIV is converted to a sparseRIV */
L2dense = mapI2D(locations, locationCount);
sparseRIV output = consolidateD2S(L2dense);
sparseRIV output = consolidateD2S(L2dense);
free(L2dense);
/* frequency records the number of words in this file */
output.frequency = locationCount/RIVKey.nonZeros;
output.boolean = 1;
return output;
......@@ -92,6 +105,10 @@ sparseRIV fileToL2Clean(FILE *data){
if(!(*word)){
break;
}
/* if the word is not clean, skip it */
if(!isWordClean((char*)word)){
continue;
}
blockSize = locationCount+RIVKey.nonZeros;
if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
......@@ -113,58 +130,65 @@ sparseRIV fileToL2Clean(FILE *data){
output.boolean = 1;
return output;
}
void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold){
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)){
int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV);
float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount;
/* if a multiplier's magnitude differs from the base's by 15% or more, the pair is treated as too dissimilar to be worth comparing */
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
/* check the baseRIV against each multiplier */
while(multipliers<multipliersStop){
if((*multipliers).boolean){
int dot = 0;
int *values = (*multipliers).values;
int *locations = (*multipliers).locations;
int *locations_Stop = locations+(*multipliers).count;
/* skip a pair if the multiplier has already been culled, or if
* the size difference is too great */
if(((*multipliers).boolean)
&& (((*multipliers).magnitude < maxsize)
&& ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine */
dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++;
values++;
}
/* magnitudes must already have been calculated by this point (see getMagnitudes) */
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
if(cosSim>=threshold){
printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim);
(*multipliers).boolean = 0;
RIVKey.thing ++;
scanf("%d", &RIVKey.thing);
}
/* perform the action defined by the action function */
action(cosSim, baseRIV, (*multipliers));
}
multipliers++;
}
}
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)){
/* unbound works without skipping on size */
void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold){
int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV);
float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount;
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
while(multipliers<multipliersStop){
if(((*multipliers).boolean)
&& (((*multipliers).magnitude < maxsize)
&& ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
if((*multipliers).boolean){
int dot = 0;
int *values = (*multipliers).values;
int *locations = (*multipliers).locations;
int *locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
......@@ -173,14 +197,18 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierC
values++;
}
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
action(cosSim, baseRIV, (*multipliers));
if(cosSim>=threshold){
printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim);
(*multipliers).boolean = 0;
RIVKey.thing ++;
scanf("%d", &RIVKey.thing);
}
}
multipliers++;
}
}
void getMagnitudes(sparseRIV *inputs, size_t RIVCount){
for(int i=0; i<RIVCount; i++){
unsigned int temp = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment