Commit 31b0c44b by Ethan

added more comment

parent 7c37cc43
Showing with 64 additions and 36 deletions
...@@ -32,19 +32,26 @@ sparseRIV fileToL2(FILE *input); ...@@ -32,19 +32,26 @@ sparseRIV fileToL2(FILE *input);
* this is important if you will be lexPush-ing those words later * this is important if you will be lexPush-ing those words later
*/ */
sparseRIV fileToL2Clean(FILE *data); sparseRIV fileToL2Clean(FILE *data);
/* cosine determines the "similarity" between two RIVs. */
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)); void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier));
/* magnitudes will be used later in cosine comparison */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount); void getMagnitudes(sparseRIV *inputs, size_t RIVCount);
sparseRIV text2L2(unsigned char *text);//unused sparseRIV text2L2(unsigned char *text);//unused
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused except in text2L2
sparseRIV fileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned int blockSize; unsigned int blockSize;
unsigned char word[100] = {0}; unsigned char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
...@@ -52,23 +59,29 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unuse ...@@ -52,23 +59,29 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unuse
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
blockSize = locationCount+RIVKey.nonZeros; blockSize = locationCount+RIVKey.nonZeros;
/* if this word would overflow the locations block, grow it */
if(blockSize>RIVKey.tempSize){ if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
locations = RIVKey.h_tempBlock; locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=RIVKey.nonZeros; RIVKey.tempSize+=RIVKey.nonZeros;
} }
/* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount++; locationCount+= RIVKey.nonZeros;
} }
int *L2dense; int *L2dense;
/* in the next two steps, an implicit RIV is converted to a sparseRIV */
L2dense = mapI2D(locations, locationCount); L2dense = mapI2D(locations, locationCount);
sparseRIV output = consolidateD2S(L2dense);
sparseRIV output = consolidateD2S(L2dense);
free(L2dense); free(L2dense);
/* frequency records the number of words in this file */
output.frequency = locationCount/RIVKey.nonZeros; output.frequency = locationCount/RIVKey.nonZeros;
output.boolean = 1; output.boolean = 1;
return output; return output;
...@@ -92,6 +105,10 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -92,6 +105,10 @@ sparseRIV fileToL2Clean(FILE *data){
if(!(*word)){ if(!(*word)){
break; break;
} }
/* if the word is not clean, skip it */
if(!isWordClean((char*)word)){
continue;
}
blockSize = locationCount+RIVKey.nonZeros; blockSize = locationCount+RIVKey.nonZeros;
if(blockSize>RIVKey.tempSize){ if(blockSize>RIVKey.tempSize){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
...@@ -113,58 +130,65 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -113,58 +130,65 @@ sparseRIV fileToL2Clean(FILE *data){
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold){ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)){
int *baseDenseRIV = RIVKey.h_tempBlock; int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV); mapS2D(baseDenseRIV, baseRIV);
float cosSim; float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount; sparseRIV *multipliersStop = multipliers+multiplierCount;
/* if two vectors are too different in size, we can ignore the risk of similarity */
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
/* check the baseRIV against each multiplier */
while(multipliers<multipliersStop){ while(multipliers<multipliersStop){
if((*multipliers).boolean){ /* skip a pair if the multiplier has already been culled, or if
int dot = 0; * the size difference is too great */
int *values = (*multipliers).values; if(((*multipliers).boolean)
int *locations = (*multipliers).locations; && (((*multipliers).magnitude < maxsize)
int *locations_Stop = locations+(*multipliers).count; && ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){ while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine */
dot += (*values)*(*(baseDenseRIV+(*locations))); dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++; locations++;
values++; values++;
} }
/* magnitudes had better already be calculated at this point*/
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude)); cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
if(cosSim>=threshold){
printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim); /* perform the action defined by the action function */
(*multipliers).boolean = 0; action(cosSim, baseRIV, (*multipliers));
RIVKey.thing ++;
scanf("%d", &RIVKey.thing);
}
} }
multipliers++; multipliers++;
} }
} }
void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, int (*action)(float cosine, sparseRIV base, sparseRIV multiplier)){ /* unbound works without skipping on size */
void cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount, float threshold){
int *baseDenseRIV = RIVKey.h_tempBlock; int *baseDenseRIV = RIVKey.h_tempBlock;
mapS2D(baseDenseRIV, baseRIV); mapS2D(baseDenseRIV, baseRIV);
float cosSim; float cosSim;
sparseRIV *multipliersStop = multipliers+multiplierCount; sparseRIV *multipliersStop = multipliers+multiplierCount;
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
while(multipliers<multipliersStop){ while(multipliers<multipliersStop){
if(((*multipliers).boolean) if((*multipliers).boolean){
&& (((*multipliers).magnitude < maxsize) int dot = 0;
&& ((*multipliers).magnitude > minsize))){ int *values = (*multipliers).values;
dot = 0; int *locations = (*multipliers).locations;
values = (*multipliers).values; int *locations_Stop = locations+(*multipliers).count;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){ while(locations<locations_Stop){
...@@ -173,14 +197,18 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierC ...@@ -173,14 +197,18 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierC
values++; values++;
} }
cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude)); cosSim= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
if(cosSim>=threshold){
action(cosSim, baseRIV, (*multipliers)); printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, cosSim);
(*multipliers).boolean = 0;
RIVKey.thing ++;
scanf("%d", &RIVKey.thing);
}
} }
multipliers++; multipliers++;
} }
} }
void getMagnitudes(sparseRIV *inputs, size_t RIVCount){ void getMagnitudes(sparseRIV *inputs, size_t RIVCount){
for(int i=0; i<RIVCount; i++){ for(int i=0; i<RIVCount; i++){
unsigned int temp = 0; unsigned int temp = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment