secured push against cache

ad4b27c9 · etcart · 9fd65b3a · ad4b27c9 · ad4b27c9 · 9fd65b3a
Commit ad4b27c9 authored Apr 25, 2018 by etcart
Showing with 515 additions and 591 deletions
RIVLower.h
RIVLower.h.gch
RIVLower.h.save
RIVclasses
RIVclasses.c
RIVclasses.o
RIVconsolidate.c
RIVcull
RIVcull.c
RIVgraphout
RIVgraphout.c
RIVgraphout.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread
RIVread.c
RIVread.o
RIVtools.h.gch
graphdata.txt
output.txt
--- a/RIVLower.h
+++ b/RIVLower.h
@@ -7,6 +7,7 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include "RIVaccessories.h"
+#include "assert.h"
 /* RIVSIZE macro defines the dimensionality off the RIVs we will use
 * 25000 is the standard, but can be redefined specifically
 */
@@ -14,8 +15,8 @@
 #define RIVSIZE 25000
 #endif
-#if RIVSIZE<0
+#if RIVSIZE<4
-#error "RIVSIZE must be a positive number (preferably a large positive)"
+#error "RIVSIZE must be a positive number, greater than 4 (preferably a large positive)"
 #endif
 /* NONZeros macro defines the number of non-zero values that will be generated
@@ -36,7 +37,7 @@
 * that do not use lexpull/push
 */
 #ifndef CACHESIZE
-#define CACHESIZE 5000
+#define CACHESIZE 10000
 #endif
 #if CACHESIZE<0
@@ -57,10 +58,10 @@ typedef struct{
 	char name[100];
 	int *values;
 	int *locations;
-	size_t count;
+	int count;
-	float magnitude;
-	int contextSize;
 	int frequency;
+	int contextSize;
+	float magnitude;
 }sparseRIV;
 /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
 * this is rarely the case, but its primary use is for performing vector
@@ -68,11 +69,11 @@ typedef struct{
 * performed between sparse and dense (hetero-arithmetic)
 */
 typedef struct{
-	int cached;
 	char name[100];
+	int cached;
 	int frequency;
-	float magnitude;
 	int contextSize;
+	float magnitude;
 	int values[RIVSIZE];
 }denseRIV;
@@ -99,13 +100,13 @@ sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 */
-void makeSparseLocations(char* word,  int *seeds, size_t seedCount);
+void makeSparseLocations(char* word,  int *seeds, int seedCount);
 /* mapI2D maps an "implicit RIV" that is, an array of index values, 
 * arranged by chronological order of generation (as per makesparseLocations)
 * it assigns, in the process of mapping, values according to ordering
 */
-int* mapI2D(int *locations, size_t seedCount);
+int* mapI2D(int *locations, int seedCount);
 /* highly optimized method for adding vectors.  there is no method 
 * included for adding D2D or S2S, as this system is faster-enough
@@ -121,7 +122,7 @@ int cacheDump();
 /* adds all elements of an implicit RIV (a sparseRIV represented without values)
 * to a denseRIV.  used by the file2L2 functions in aggregating a document vector
 */
-int* addI2D(int* destination, int* locations, size_t seedCount);
+int* addI2D(int* destination, int* locations, int seedCount);
 /*subtracts a words vector from its own context.  regularly used in lex building
 */
@@ -136,6 +137,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
 	/* apply values at an index based on locations */
 	while(locations_slider<locations_stop){
 		destination[*locations_slider] += *values_slider;
 		locations_slider++;
 		values_slider++;
@@ -144,7 +146,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
 	return destination;
 }
-int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+int* mapI2D(int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
 	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
 	int *locations_slider = locations;
 	int *locations_stop = locations_slider+valueCount;
@@ -160,7 +162,7 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
 	return destination;
 }
-int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+int* addI2D(int* destination, int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
 	int *locations_slider = locations;
 	int *locations_stop = locations_slider+valueCount;
@@ -203,6 +205,7 @@ sparseRIV consolidateD2S(int *denseInput){
 		}
 	}
 	/* a slot is opened for the locations/values pair */
 	output.locations = (int*) malloc(output.count*2*sizeof(int));
 	if(!output.locations){
 		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
@@ -220,7 +223,7 @@ sparseRIV consolidateD2S(int *denseInput){
-void makeSparseLocations(char* word,  int *locations, size_t count){
+void makeSparseLocations(char* word,  int *locations, int count){
 	locations+=count;
 	srand(wordtoSeed(word));
 	int *locations_stop = locations+NONZEROS;

--- a/RIVLower.h.gch
+++ b/RIVLower.h.gch
--- a/RIVLower.h.save
+++ b/RIVLower.h.save
-#ifndef RIVLOWER_H_
-#define RIVLOWER_H_
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <signal.h>
-#include <unistd.h>
-#include <math.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-/* RIVSIZE macro defines the dimensionality off the RIVs we will use
- * 25000 is the standard, but can be redefined specifically
- */
-#ifndef RIVSIZE
-#define RIVSIZE 25000
-#endif
-#if RIVSIZE<0
-#error "RIVSIZE must be a positive number (preferably a large positive)"
-#endif
-/* NONZeros macro defines the number of non-zero values that will be generated
- * for any level one (barcode) RIV.  2 is simple and lightweight to begin
- */
-#ifndef NONZEROS
-#define NONZEROS 2
-#endif
-#if NONZEROS%2 || NONZEROS<1
-#error "NONZEROS must be an even, greater than 0 number"
-#endif
-/* CACHESIZE macro defines the number of RIVs the system will cache.
- * a larger cache means more memory consumption, but will also be significantly
- * faster in aggregation and reading applications. doesn't affect systems
- * that do not use lexpull/push
- */
-#ifndef CACHESIZE
-#define CACHESIZE 20
-#endif
-#if CACHESIZE<0
-#error "CACHESIZE cannot be a negative number"
-#endif
-/* the size of the tempBlock used in consolidation and implicit RIVs */
-#define TEMPSIZE 3*RIVSIZE
-/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
- * as this is often an ideal case, it is adviseable as the default 
- * unless we are doing long term RIV aggregation.
- * specifically, a sparseRIV contains a pair of arrays, 
- * containing locations and values, where pairs are found in like array 
- * indices.
- */
-typedef struct{
-	char name[100];
-	int *values;
-	int *locations;
-	size_t count;
-	int frequency;
-	double magnitude;
-	int boolean;
-	int contextSize;
-}sparseRIV;
-/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
- * this is rarely the case, but its primary use is for performing vector
- * math, as comparisons and arithmetic between vectors are ideally 
- * performed between sparse and dense (hetero-arithmetic)
- */
-typedef struct{
-	char name[100];
-	int* values;
-	int* frequency;
-	double magnitude;
-	int cached;
-	int *contextSize;
-}denseRIV;
-/*RIVKey, holds global variables used under the hood, primarily for the lexicon
- * it also holds a "temp block" that will be used by the dense to sparse 
- * conversion and implicit RIV aggregation 
-*/
-struct RIVData{
-	int h_tempBlock[TEMPSIZE];
-	int tempSize;
-	char lexName[255];
-	denseRIV RIVCache[CACHESIZE];
-}static RIVKey;
-/* lexOpen is called to "open the lexicon", setting up for later calls to
- * lexPush and lexPull. if the lexicon has not been opened before calls
- * to these functions, their behavior can be unpredictable, most likely crashing
- */
-void lexOpen();
-/* lexClose should always be called after the last lex push or lex pull call
- * if the lexicon is left open, some vector data may be lost due to 
- * un-flushed RIV cache
- */
-void lexClose();
-/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
- * all 0s removed. it does not automatically carry metadata, which must be assigned
- * to a denseRIV after the fact.  often denseRIVs are only temporary, and don't
- * contain any metadata
- */
-sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
-/* makeSparseLocations must be called repeatedly in the processing of a 
- * file to produce a series of locations from the words of the file
- * this produces an "implicit" RIV which can be used with the mapI2D function
- * to create a denseRIV.
- */
-void makeSparseLocations(unsigned char* word,  int *seeds, size_t seedCount);
-/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
- * saving it for long-term aggregation.  function is called by "lexPush",
- * which is what users should actually use.  lexPush, unlike fLexPush,
- * has cache logic under the hood for speed and harddrive optimization
- */
-int fLexPush(denseRIV RIVout);
-/* flexPull pulls data directly from a file and converts it (if necessary)
- * to a denseRIV.  function is called by "lexPull" which is what users 
- * should actually use.  lexPull, unlike FlexPull, has cache logic under
- * the hood for speed and harddrive optimization 
- */
-denseRIV fLexPull(FILE* lexWord);
-/* creates a standard seed from the characters in a word, hopefully unique */
-int wordtoSeed(unsigned char* word);
-/* mapI2D maps an "implicit RIV" that is, an array of index values, 
- * arranged by chronological order of generation (as per makesparseLocations)
- * it assigns, in the process of mapping, values according to ordering
- */
-int* mapI2D(int *locations, size_t seedCount);
-/* highly optimized method for adding vectors.  there is no method 
- * included for adding D2D or S2S, as this system is faster-enough
- * to be more than worth using
- */
-int* addS2D(int* destination, sparseRIV input);
-/*
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
-* consolidate I2S is temporarily deprecated.  may be brought back.
-* in tandem they are much faster, but less careful with RAM */
-/* caheDump flushes the RIV cache out to relevant files, backing up all 
- * data.  this is called by the lexClose and signalSecure functions
- */
-int cacheDump();
-/* adds all elements of an implicit RIV (a sparseRIV represented without values)
- * to a denseRIV.  used by the file2L2 functions in aggregating a document vector
- */
-int* addI2D(int* destination, int* locations, size_t seedCount);
-/* allocates a denseRIV filled with 0s
- */
-denseRIV denseAllocate();
-/* redefines signal behavior to protect cached data against seg-faults etc*/
-void signalSecure(int signum);
-/* begin definitions */
-int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
-	int *locations_slider = input.locations;	int *values_slider = input.values;
-	int *locations_stop = locations_slider+input.count;
-	/* apply values at an index based on locations */
-	while(locations_slider<locations_stop){
-		destination[*locations_slider] += *values_slider;
-		locations_slider++;
-		values_slider++;
-	}
-	return destination;
-}
-int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
-	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
-	int *locations_slider = locations;
-	int *locations_stop = locations_slider+valueCount;
-	/*apply values +1 or -1 at an index based on locations */
-	while(locations_slider<locations_stop){
-		destination[*locations_slider] +=1;
-		locations_slider++;
-		destination[*locations_slider] -= 1;
-		locations_slider++;
-	}
-	return destination;
-}
-int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
-	int *locations_slider = locations;
-	int *locations_stop = locations_slider+valueCount;
-	/*apply values +1 or -1 at an index based on locations */
-	while(locations_slider<locations_stop){
-		destination[*locations_slider] +=1;
-		locations_slider++;
-		destination[*locations_slider] -= 1;
-		locations_slider++;
-	}
-	return destination;
-}
-/*
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
-	int *denseTemp = mapI2D(implicit, valueCount);
-	sparseRIV sparseOut = consolidateD2S(denseTemp);
-	free(denseTemp);
-	return sparseOut;
-}
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
-	sparseRIV sparseOut;
-	int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
-	int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
-	sparseOut.count = 0;
-	int add = 1;
-	int found;
-	for(int i=0; i<valueCount; i++){
-		found = 0;
-		for(int j=0; j<sparseOut.count; j++){
-			if(implicit[i] == locationsTemp[j]){
-				valuesTemp[i] += add;
-				add *= -1;
-				found = 1;
-			}
-		}
-		if(!found){
-			locationsTemp[sparseOut.count] = implicit[i];
-			valuesTemp[sparseOut.count] = add;
-			sparseOut.count++;
-			add*= -1;
-		}
-	}
-	sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
-	sparseOut.values = sparseOut.locations+sparseOut.count;
-	memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
-	memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
-	return sparseOut;
-}*/
-sparseRIV consolidateD2S(int *denseInput){
-	sparseRIV output;
-	output.count = 0;
-	/* key/value pairs will be loaded to a worst-case sized temporary slot */
-	int* locations = RIVKey.h_tempBlock+RIVSIZE;
-	int* values = locations+RIVSIZE;
-	int* locations_slider = locations;
-	int* values_slider = values;
-	for(int i=0; i<RIVSIZE; i++){
-		/* act only on non-zeros */
-		if(denseInput[i]){
-			/* assign index to locations */
-			*(locations_slider++) = i;
-			/* assign value to values */
-			*(values_slider++) = denseInput[i];
-			/* track size of forming sparseRIV */
-			output.count++;
-		}
-	}
-	/* a slot is opened for the locations/values pair */
-	output.locations = (int*) malloc(output.count*2*sizeof(int));
-	if(!output.locations){
-		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
-	}
-	/* copy locations values into opened slot */
-	memcpy(output.locations, locations, output.count*sizeof(int));
-	output.values = output.locations + output.count;
-	/* copy values into opened slot */
-	memcpy(output.values, values, output.count*sizeof(int));
-	return output;
-}
-void lexOpen(char* lexName){
-	/* RIVKey.I2SThreshold = sqrt(RIVSIZE);*/ //deprecate?
-	struct stat st;
-	if (stat(lexName, &st) == -1) {
-		mkdir(lexName, 0777);
-	}	
-	strcpy(RIVKey.lexName, lexName);
-	/* open a slot at least large enough for worst case handling of
-	 * sparse to dense conversion.  may be enlarged by filetoL2 functions */
-		signal(11, signalSecure);
-	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
-	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
-}
-void lexClose(){
-	if(cacheDump()){
-		puts("cache dump failed, some lexicon data was lost");
-	}
-}
-int wordtoSeed(unsigned char* word){
-	int i=0;
-	int seed = 0;
-	while(*word){
-		/* left-shift 5 each time *should* make seeds unique to words
-		 * this means letters are taken as characters counted in base 32, which
-		 * should be large enough to hold all english characters plus a few outliers
-		 * */
-		seed += (*(word))<<(i*5);
-		word++;
-		i++;
-	}
-	return seed;
-}
-void makeSparseLocations(unsigned char* word,  int *locations, size_t count){
-	locations+=count;
-	srand(wordtoSeed(word));
-	int *locations_stop = locations+NONZEROS;
-	while(locations<locations_stop){
-		/* unrolled for speed, guaranteed to be an even number of steps */
-		*locations = rand()%RIVSIZE;
-		locations++;
-		*locations = rand()%RIVSIZE;
-		locations++;
-	}
-	return;
-}
-int fLexPush(denseRIV RIVout){	
-	char pathString[200] = {0};
-	/* word data will be placed in a (new?) file under the lexicon directory
-	 * in a file named after the word itself */
-	sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
-	FILE *lexWord = fopen(pathString, "wb");
-	if(!lexWord){
-		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
-		return 1;
-	}
-	sparseRIV temp = consolidateD2S(RIVout.values);
-	if(temp.count<(RIVSIZE/2)){
-		/* smaller stored as sparse vector */
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
-		fwrite(temp.values, temp.count, sizeof(int), lexWord);
-	//	printf("%s, writing as sparse, frequency: %d", RIVout.name, *RIVout.frequency);
-	}else{
-		/* saturation is too high, better to store dense */
-		/* there's gotta be a better way to do this */
-		temp.count = 0;
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
-	//	printf("%s, writing as dense, frequency: %d", RIVout.name, *RIVout.frequency);
-	}
-	fclose(lexWord);
-	free(RIVout.values);
-	free(temp.locations);
-	return 0;
-}
-denseRIV fLexPull(FILE* lexWord){
-	denseRIV output = denseAllocate();
-	size_t typeCheck;
-	int flag = 0;
-	/* get metadata for vector */
-	flag+= fread(&typeCheck, 1, sizeof(size_t), lexWord);
-	flag+= fread(output.frequency, 1, sizeof(int), lexWord);
-	flag+= fread(output.contextSize, 1, sizeof(int), lexWord);
-	flag+= fread(&(output.magnitude), 1, sizeof(float), lexWord);
-	/* first value stored is the value count if sparse, and 0 if dense */
-	if (typeCheck){
-		/* pull as sparseVector */
-		sparseRIV temp;
-		/* value was not 0, so it's the value count */
-		temp.count = typeCheck;
-		temp.locations = (int*)malloc(temp.count*2*sizeof(int));
-		temp.values = temp.locations+temp.count;
-		flag+= fread(temp.locations, temp.count, sizeof(int), lexWord);
-		flag+=fread(temp.values, temp.count, sizeof(int), lexWord);
-		addS2D(output.values, temp);
-		free(temp.locations);
-	}else{
-		/* typecheck is thrown away, just a flag in this case */
-		flag += fread(output.values, RIVSIZE, sizeof(int), lexWord);
-	}
-	output.cached = 0;
-	return output;
-}
-void signalSecure(int signum){
-  if(cacheDump()){
-	  puts("cache dump failed, some lexicon data lost");
-  }else{
-	puts("cache dumped successfully");
-  }
-  signal(signum, SIG_DFL);
-  exit(1);
-}
-int cacheDump(){
-	int flag = 0;
-	denseRIV* cache_slider = RIVKey.RIVCache;
-	denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
-	while(cache_slider<cache_stop){
-		if((*cache_slider).cached){
-			flag += fLexPush(*cache_slider);
-		}
-		else{
-		}
-		cache_slider++;
-	}
-	return flag;
-}
-denseRIV denseAllocate(){
-	/* allocates a 0 vector */
-	denseRIV output;
-	output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
-	/* for compact memory use, frequency is placed immediately after values */
-	output.frequency = output.values+RIVSIZE;
-	output.contextSize = output.frequency+1;
-	output.magnitude = 0;
-	output.cached = 0;
-	return output;
-}
-/*TODO add a simplified free function*/
-#endif
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
+#include <stdio.h>
+#define CACHESIZE 0
+#define CACHEEXCLUSIVE 1
+#define RIVSIZE 50000
+#include "RIVtools.h"
+char* stem(char* word);
+/* Aggregates one text into a single dense vector: each whitespace-separated
+ * word is passed through stem(), pulled from the lexicon
+ * "consolidatedLexicon50-8" and added into `accumulate`.
+ * Words that stem() rejects or that the lexicon does not hold are reported
+ * on stdout and skipped.
+ * Returns 0 on success, 1 if the text file cannot be opened. */
+int main(){
+	lexOpen("consolidatedLexicon50-8");
+	FILE* text = fopen("../books/pg56902.txt", "r");
+	if(!text){
+		puts("no file");
+		lexClose();
+		return 1;
+	}
+	denseRIV accumulate = {0};
+	sparseRIV temp;
+	char word[100];
+	/* fscanf returns 1 only when a word was really read; testing that
+	 * (instead of feof after the read) keeps the final word of a file
+	 * that does not end in whitespace, and still stops on EOF or error */
+	while(fscanf(text, "%99s", word) == 1){
+		if(!stem(word)){
+			printf("%s, not in wordNet\n", word);
+			continue;
+		}
+		/* CACHEEXCLUSIVE is set for this program, so lexPull hands back
+		 * NULL rather than a fresh 0 vector for unknown words */
+		denseRIV* wordRIV = lexPull(word);
+		if(!wordRIV){
+			printf("%s, not in lexicon\n", word);
+			continue;
+		}
+		temp = consolidateD2S(wordRIV->values);
+		addS2D(accumulate.values, temp);
+		free(temp.locations);
+		free(wordRIV);
+	}
+	/* NOTE(review): accumulate is filled but nothing reads it yet */
+	fclose(text);
+	lexClose();
+	return 0;
+}
+/* stem rewrites `word` in place to the form recorded for it under WN/.
+ * The file WN/<word> begins with an integer code:
+ *   1 - `word` is returned unchanged
+ *   2 - the code is followed by a second word; WN/<second word> is opened
+ *       and the string after its leading integer replaces `word`
+ * A missing file, an unreadable code, or any other code returns NULL.
+ * Reads into `word` are capped at 99 characters plus the terminator, so the
+ * buffer must hold at least 100 bytes (the caller in this file passes
+ * char[100]).  Every opened file is closed before returning. */
+char* stem(char* word){
+	char pathString[200];
+	/* stays 0 when the code cannot be parsed, which selects the NULL result */
+	int WNdata = 0;
+	char* result = NULL;
+	snprintf(pathString, sizeof pathString, "WN/%s", word);
+	FILE* WNfile = fopen(pathString, "r");
+	if(!WNfile) return NULL;
+	if(fscanf(WNfile, "%d", &WNdata) != 1){
+		WNdata = 0;
+	}
+	if(WNdata == 1){
+		result = word;
+	}else if(WNdata == 2){
+		/* a failed read leaves `word` untouched, as it always has */
+		if(fscanf(WNfile, "%99s", word) != 1){
+			/* nothing to do: fall through with the original word */
+		}
+		fclose(WNfile);
+		snprintf(pathString, sizeof pathString, "WN/%s", word);
+		WNfile = fopen(pathString, "r");
+		if(!WNfile) return NULL;
+		if(fscanf(WNfile, "%*d%99s", word) != 1){
+			/* nothing to do: `word` keeps the intermediate form */
+		}
+		result = word;
+	}
+	fclose(WNfile);
+	return result;
+}
--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVconsolidate.c
+++ b/RIVconsolidate.c
+#include <stdio.h>
+#define RIVSIZE 50000
+#define CACHESIZE 0
+#include "RIVtools.h"
+#include <dirent.h>
+/* Copies the significant vectors of the lexicon directory argv[1] into the
+ * lexicon "consolidatedLexicon50-8".  A vector is kept when its contextSize
+ * is at least 7000; kept vectors are passed through normalize(..., 10000).
+ * For every kept vector one line is printed to stdout:
+ *   frequency,contextSize,magnitude,kept-index,examined-count
+ * Returns 0 on success, 1 when no directory is given or it cannot be read. */
+int main(int argc, char* argv[]){
+	if(argc < 2){
+		fprintf(stderr, "give me a lexicon directory\n");
+		return 1;
+	}
+	lexOpen(argv[1]);
+	denseRIV* intake;
+	sparseRIV examine;
+	static denseRIV *output[60000] = {0};
+	const int capacity = (int)(sizeof output / sizeof output[0]);
+	DIR *directory;
+	struct dirent *files = 0;
+	if(!(directory = opendir(argv[1]))){
+		printf("location not found, %s\n", argv[1]);
+		lexClose();
+		return 1;
+	}
+	int i=0;
+	int j=0;
+	while((files=readdir(directory))){
+		if(*(files->d_name) == '.') continue;
+		if(files->d_type == DT_DIR){
+			/* the lexicon should not have valid sub-directories */
+			continue;
+		}
+		/* RIV names are char[100]; a longer file name would overflow the
+		 * strcpy calls below, so such entries are skipped */
+		if(strlen(files->d_name) >= sizeof examine.name){
+			fprintf(stderr, "name too long, skipped: %s\n", files->d_name);
+			continue;
+		}
+		if(i >= capacity){
+			fprintf(stderr, "output table full after %d vectors, stopping\n", i);
+			break;
+		}
+		j++;
+		intake = lexPull(files->d_name);
+		if(!intake) continue;
+		/* a vector with a small contextSize has been seen too rarely to be
+		 * statistically significant, so it is dropped */
+		if(intake->contextSize<7000){
+			free(intake);
+			continue;
+		}
+		examine = normalize(*intake, 10000);
+		strcpy(examine.name, files->d_name);
+		printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
+		output[i] = calloc(1, sizeof(denseRIV));
+		if(!output[i]){
+			fprintf(stderr, "memory allocation failed, stopping\n");
+			free(intake);
+			free(examine.locations);
+			break;
+		}
+		addS2D(output[i]->values, examine);
+		output[i]->magnitude = examine.magnitude;
+		strcpy(output[i]->name, files->d_name);
+		output[i]->frequency = intake->frequency;
+		output[i]->contextSize = intake->contextSize;
+		free(intake);
+		free(examine.locations);
+		i++;
+	}
+	closedir(directory);
+	lexClose();
+	/* second pass: write everything that was kept into the target lexicon */
+	lexOpen("consolidatedLexicon50-8");
+	for(int k=0; k<i; k++){
+		lexPush(output[k]);
+	}
+	lexClose();
+	return 0;
+}
--- a/RIVcull
+++ b/RIVcull
--- a/RIVcull.c
+++ b/RIVcull.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <time.h>
+#include "RIVtools.h"
+#define THRESHOLD 0.70
+/* this program identifies all near-duplicates among the documents in the 
+ * chosen root directory, using RIV comparison */
+// fills the fileRIVs array with a vector for each file in the root directory
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
+/* Lists near-duplicate documents under the directory given as argv[1].
+ * Every file is turned into a sparse vector by directoryToL2s; each pair
+ * whose magnitudes lie within 15% of each other is compared with cosCompare
+ * and printed when the cosine exceeds THRESHOLD.
+ * Returns 0 on success, 1 on a missing or over-long directory argument or
+ * when the initial allocation fails. */
+int main(int argc, char *argv[]){
+	int fileCount = 0;
+	char rootString[2000];
+	if(argc <2){ 
+		printf("give me a directory");
+		return 1;
+	}
+	/* argv[1] is caller-controlled and may exceed rootString, so "<dir>/"
+	 * is built with a bounded write and rejected when it does not fit */
+	int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
+	if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
+		fprintf(stderr, "directory name too long\n");
+		return 1;
+	}
+	//initializes the fileRIVs array, to be realloc'd by directoryToL2s
+	sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
+	if(!fileRIVs){
+		fprintf(stderr, "memory allocation failed\n");
+		return 1;
+	}
+	//gather all vectors into the fileRIVs array and count them in fileCount
+	directoryToL2s(rootString, &fileRIVs, &fileCount);
+	printf("fileCount: %d\n", fileCount);
+	//first calculate all magnitudes for later use
+	for(int i = 0; i < fileCount; i++){
+		fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
+	}
+	clock_t begintotal = clock();
+	double cosine;
+	double minmag;
+	double maxmag;
+	//all cosines need a sparse-dense comparison, so one dense vector is
+	//reused as the left-hand side of every comparison
+	denseRIV baseDense;
+	for(int i = 0; i < fileCount; i++){
+		//0 out the denseVector, and map the next sparseVector to it
+		memset(&baseDense, 0, sizeof(denseRIV));
+		addS2D(baseDense.values, fileRIVs[i]);
+		//pass magnitude to the dense vector
+		baseDense.magnitude = fileRIVs[i].magnitude;
+		//if these two vectors are too different in size, we can know that they are not duplicates
+		minmag = baseDense.magnitude*.85;
+		maxmag  = baseDense.magnitude*1.15;
+		for(int j = 0; j < i; j++){
+			//if this vector is within magnitude threshold
+			if(fileRIVs[j].magnitude < maxmag 
+			&& fileRIVs[j].magnitude > minmag){
+				//identify the similarity of these two vectors
+				cosine = cosCompare(baseDense, fileRIVs[j]);
+				//if the two are similar enough to be flagged
+				if(cosine>THRESHOLD){
+					printf("%s\t%s\n%f\n", fileRIVs[i].name , fileRIVs[j].name, cosine);
+				}	
+			}
+		}
+	}
+	printf("fileCount: %d", fileCount);
+	free(fileRIVs);
+	clock_t endtotal = clock();
+	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
+	printf("total time:%lf\n\n", time_spent);
+return 0;
+}
+//mostly a standard recursive Dirent-walk
+/* Recursive directory walk: appends one sparseRIV (built by fileToL2) to
+ * *fileRIVs for every readable file below rootString, and bumps *fileCount
+ * once per appended vector.
+ *   rootString - directory path, expected to end in '/'
+ *   fileRIVs   - address of a realloc-able array; it is grown one element
+ *                at a time and may be moved
+ *   fileCount  - number of valid elements in *fileRIVs, updated in place
+ * Entries whose name starts with '.' are skipped.  A stored name longer than
+ * the sparseRIV name field is truncated; a path that does not fit in the
+ * local buffer is reported on stderr and skipped. */
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
+/* *** begin Dirent walk *** */
+	char pathString[2000];
+	DIR *directory;
+	struct dirent *files = 0;
+	if(!(directory = opendir(rootString))){
+		printf("location not found, %s\n", rootString);
+		return;
+	}
+	while((files=readdir(directory))){
+		if(!files->d_name[0]) break;
+		/* skip ".", ".." and hidden entries; the loop condition handles the
+		 * NULL that readdir returns when such an entry was the last one */
+		if(files->d_name[0] == '.') continue;
+		int isDirectory = (files->d_type == DT_DIR);
+		int written = snprintf(pathString, sizeof pathString, "%s%s%s",
+				rootString, files->d_name, isDirectory ? "/" : "");
+		if(written < 0 || (size_t)written >= sizeof pathString){
+			fprintf(stderr, "path too long, skipped: %s%s\n", rootString, files->d_name);
+			continue;
+		}
+		if(isDirectory){
+			directoryToL2s(pathString, fileRIVs, fileCount);
+			continue;
+		}
+/* *** end dirent walk, begin meat of function  *** */
+		FILE *input = fopen(pathString, "r");
+		if(!input) continue;
+		/* grow through a temporary so the existing array survives a failed
+		 * realloc */
+		sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
+		if(!grown){
+			fprintf(stderr, "memory allocation failed, stopping at %s\n", pathString);
+			fclose(input);
+			break;
+		}
+		*fileRIVs = grown;
+		grown[*fileCount] = fileToL2(input);
+		/* bounded copy: pathString can be far longer than the name field */
+		snprintf(grown[*fileCount].name, sizeof grown[*fileCount].name, "%s", pathString);
+		fclose(input);
+		*fileCount += 1;
+	}
+	closedir(directory);
+}
--- a/RIVgraphout
+++ b/RIVgraphout
--- a/RIVgraphout.c
+++ b/RIVgraphout.c
 #include <stdio.h>
-#define RIVSIZE 25000
+#define RIVSIZE 50000
 #define CACHESIZE 0
 #include "RIVtools.h"
 #include <dirent.h>
@@ -7,8 +7,6 @@
 int main(int argc, char* argv[]){
 	lexOpen(argv[1]);
 	denseRIV* intake;
-	sparseRIV examine;
-	static denseRIV *output[60000] = {0};
 	DIR *directory;
    struct dirent *files = 0;
@@ -28,27 +26,15 @@ int main(int argc, char* argv[]){
 		intake = lexPull(files->d_name);
 		/* if the vector has been encountered more than MINSIZE times
 		 * then it should be statistically significant, and useful */
-		if(intake->contextSize<10000)continue;
-		examine = normalize(*intake, 500);
-		strcpy(examine.name, files->d_name);
-		printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
-		output[i] = calloc(1, sizeof(denseRIV));
-		addS2D(output[i]->values, examine);
-		output[i]->magnitude = examine.magnitude;
-		strcpy(output[i]->name, files->d_name);
-		output[i]->frequency = intake->frequency;
-		free(intake);
-		free(examine.locations);
-		i++;
-	}
-	lexClose();
-	/*lexOpen("consolidatedLexiconAggressive");
-	for(int j=0; j<i; j++){
-		lexPush(output[j]);
+		printf("%d,%d,%lf,%d,%s\n", intake->frequency, intake->contextSize, intake->magnitude, i, files->d_name);
+		free(intake);
+		i++;
 	}
-	lexClose();*/
+	lexClose();
 	return 0;
 }
--- a/RIVgraphout.o
+++ b/RIVgraphout.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
@@ -3,6 +3,17 @@
 #include "RIVLower.h"
 #include "RIVaccessories.h"
+#include "assert.h"
+#ifndef CACHEEXCLUSIVE
+#define CACHEEXCLUSIVE 0
+#endif
+/* IODISPLACEMENT: combined size of the count, frequency, contextSize and
+ * magnitude fields of a sparseRIV, expressed as a number of ints.
+ * The whole expansion is parenthesized so the macro behaves as a single
+ * value inside larger expressions (e.g. x / IODISPLACEMENT). */
+#define IODISPLACEMENT   ((sizeof(((sparseRIV*)0)->count)\
+						+ sizeof(((sparseRIV*)0)->frequency)\
+						+ sizeof(((sparseRIV*)0)->contextSize)\
+						+ sizeof(((sparseRIV*)0)->magnitude))\
+						/ sizeof(int))
+int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
 /* lexOpen is called to "open the lexicon", setting up for later calls to
 * lexPush and lexPull. if the lexicon has not been opened before calls
@@ -24,7 +35,12 @@ void lexClose();
 /* lexPush writes a denseRIV to the lexicon for permanent storage */
 int lexPush(denseRIV* RIVout);
+/* cacheCheckOnPush tests the state of this vector in our lexicon cache
+ * and returns 1 on "success" indicating cache storage and no need to push to file
+ * or returns 0 on "failure" indicating that the vector need be pushed to file 
+ */
 int cacheCheckOnPush(denseRIV* RIVout);
 /* lexPull reads a denseRIV from the lexicon, under "word"
 * if the file does not exist, it creates a 0 vector with the name of word
 * lexPull returns a denseRIV *pointer* because its data must be tracked 
@@ -32,6 +48,11 @@ int cacheCheckOnPush(denseRIV* RIVout);
 */
 denseRIV* lexPull(char* word);
+/* cacheCheckonPull checks if the word's vector is stored in cache,
+ * and returns a pointer to that vector on success
+ * or returns a NULL pointer if the word is not cached, indicating a need 
+ * to pull from file
+ */
 denseRIV* cacheCheckOnPull(char* word);
 /* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
@@ -51,6 +72,10 @@ denseRIV* fLexPull(FILE* lexWord);
 /* redefines signal behavior to protect cached data against seg-faults etc*/
 void signalSecure(int signum, siginfo_t *si, void* arg);
+/* used exclusively by flexpush to determine write-style (sparse or dense)
+ * and also formats the "IOstagingSlot" for fwrite as a single block if sparse
+ */
+int saturationForStaging(denseRIV* output);
 /* begin definitions */
 void lexOpen(char* lexName){
@@ -94,7 +119,8 @@ denseRIV* cacheCheckOnPull(char* word){
 }
 #endif
 denseRIV* lexPull(char* word){
-	denseRIV* output;
+	denseRIV* output = NULL;
 	#if CACHESIZE > 0
@@ -105,24 +131,30 @@ denseRIV* lexPull(char* word){
 	#endif /* CACHESIZE > 0 */
 	/* if not, attempt to pull the word data from lexicon file */
 	char pathString[200];
 	sprintf(pathString, "%s/%s", RIVKey.lexName, word);
 	FILE *lexWord = fopen(pathString, "rb");
 	/* if this lexicon file already exists */
 	if(lexWord){
 		/* pull data from file */
 		output = fLexPull(lexWord);
+		strcpy(output->name, word);
 		fclose(lexWord);
 	}else{
-		/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons? 
+		#if CACHEEXCLUSIVE == 0
+		/*if file does not exist, return a 0 vector (word is new to the lexicon */
 		output = calloc(1, sizeof(denseRIV));
+		strcpy(output->name, word);
+		#endif
+		/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
 	}
-	strcpy(output->name, word);
 	return output;
 }
 #if CACHESIZE > 0
@@ -159,87 +191,170 @@ int cacheCheckOnPush(denseRIV* RIVout){
 }
 #endif
 int lexPush(denseRIV* RIVout){ /* hand RIVout to the cache when caching is compiled in, otherwise write it to file */
 	#if CACHESIZE > 0
+	/* offer the vector to the cache first; this step is compiled in only
+	 * when CACHESIZE > 0 */
 	if(cacheCheckOnPush(RIVout)){
+		/* a nonzero result from cacheCheckOnPush means the vector has been
+		 * dealt with in cache: report success (0) without touching the file */
 		return 0;
 	}
 	#endif /* CACHESIZE > 0 */
-	/* find the cache-slot where this word belongs */
+	/* not taken by the cache: push to the lexicon file and pass
+	 * fLexPush's return value back to the caller */
 	return fLexPush(RIVout);
+}
+int saturationForStaging(denseRIV* output){
+	/* builds the sparse on-disk image of `output` inside IOstagingSlot and
+	 * returns the number of non-zero entries found in output->values.
+	 * layout written, in ints starting at IOstagingSlot:
+	 *   [0] count of non-zeros   [1] frequency   [2] contextSize
+	 *   [3] magnitude (stored as a float in an int-sized cell)
+	 *   [4 .. 4+count)            locations (indices of the non-zeros)
+	 *   [4+count .. 4+2*count)    values at those locations
+	 * key/value pairs will be loaded to a worst-case sized temporary slot */
+	int* count = IOstagingSlot;
+	*count = 0;
+	*(count+1) = output->frequency;
+	*(count+2) = output->contextSize;
+	*(float*)(count+3) = output->magnitude;
+	int* locations = IOstagingSlot+4;
+	int* values = IOstagingSlot-RIVSIZE;; /* scratch area RIVSIZE ints *before* the slot; NOTE(review): assumes the buffer behind IOstagingSlot extends that far down - its definition is not visible here, confirm */
+	int* locations_slider = locations;
+	int* values_slider = values;
+	for(int i=0; i<RIVSIZE; i++){
+		/* act only on non-zeros */
+		if(output->values[i]){
+			/* assign index to locations */
+			*(locations_slider++) = i;
+			/* assign value to values (still in the scratch area) */
+			*(values_slider++) = output->values[i];
+			/* track size of forming sparseRIV */
+			*count += 1;
+		}
+	}
+	/* copy values into slot immediately after locations, so header,
+	 * locations and values form one contiguous run of 4 + 2*count ints */
+	memcpy(locations+*count, values, (*count)*sizeof(int));
+	return *count;
 }
 int fLexPush(denseRIV* output){	
 	char pathString[200] = {0};
-	denseRIV RIVout = *output;
 	/* fLexPush writes one denseRIV to its lexicon file and then frees it.
 	 * word data will be placed in a (new?) file under the lexicon directory
 	 * in a file named after the word itself.
 	 * returns 0 after a write attempt, 1 if the file could not be opened.
 	 * NOTE(review): the sprintf into the 200-byte pathString is unbounded */
-	sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
+	sprintf(pathString, "%s/%s", RIVKey.lexName, output->name);
-	FILE *lexWord = fopen(pathString, "wb");
+	int saturation = saturationForStaging(output); /* count of non-zero values; the call also stages the sparse image in IOstagingSlot */
-	if(!lexWord){
-		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
+	if( saturation < RIVSIZE/2){ /* fewer than half the values are non-zero: store in sparse form */
-		return 1;
-	}
+		FILE *lexWord = fopen(pathString, "wb");
+		if(!lexWord){
-	sparseRIV temp = consolidateD2S(RIVout.values);
+			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
-	if(temp.count<(RIVSIZE/2)){
+			return 1; /* NOTE(review): output is not freed on this path */
-		/* smaller stored as sparse vector */
+		}
+		fwrite(IOstagingSlot, (saturation*2)+4, sizeof(int), lexWord); /* 4 header ints + locations + values; size and count are passed in the reverse of fwrite's (ptr, size, nmemb) order, the byte total is the same; result is not checked */
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+		fclose(lexWord);
-		fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
-		fwrite(temp.values, temp.count, sizeof(int), lexWord);
 	}else{
-		/* saturation is too high, better to store dense */
+		output->cached = 0; /* the cached field doubles as the leading on-disk type flag: 0 marks a dense record */
-		/* there's gotta be a better way to do this */
+		FILE *lexWord = fopen(pathString, "wb");
-		temp.count = 0;
+		if(!lexWord){
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
-		fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
+			return 1; /* NOTE(review): output is not freed on this path */
-		fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
+		}
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
+		fwrite(&output->cached, sizeof(int), RIVSIZE+4, lexWord); /* one run of RIVSIZE+4 ints starting at cached; relies on cached, frequency, contextSize, magnitude, values being contiguous in denseRIV in that order; result is not checked */
-		fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
+		fclose(lexWord);
 	}
+	/* older way of writing, kept while debugging new */
+	//~ if(temp.count<(RIVSIZE/2)){
+		//~ /* smaller stored as sparse vector */
+		//~ *writeStaging = temp.count;
+		//~ stagingSize = sizeof(temp.count);
+		//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
+		//~ stagingSize += sizeof(int)*3;
+		//~ memcpy(writeStaging+stagingSize, temp.locations, temp.count*2*sizeof(int));
+		//~ stagingSize += temp.count*2*sizeof(int);
+		//~ fwrite(writeStaging, 1, stagingSize, lexWord);
+		//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+		//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
+		//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
+		//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
+		//~ fwrite(temp.locations, temp.count, sizeof(int), lexWord);
+		//~ fwrite(temp.values, temp.count, sizeof(int), lexWord);*/
+	//~ }else{
+		//~ /* saturation is too high, better to store dense */
+		//~ /* there's gotta be a better way to do this */
+		//~ *writeStaging = 0;
+		//~ stagingSize = sizeof(temp.count);
+		//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
+		//~ stagingSize += sizeof(int)*3;
+		//~ memcpy(writeStaging+stagingSize, RIVout.values, sizeof(int)*RIVSIZE);
+		//~ stagingSize +=sizeof(int)*RIVSIZE;
+		//~ fwrite(writeStaging, 1, stagingSize, lexWord);
+		//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+		//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
+		//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
+		//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
+		//~ fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);*/
+	//~ }
-	fclose(lexWord);
 	free(output); /* the vector is released here once the write has been attempted; the early return 1 paths above skip this */
-	free(temp.locations);
 	return 0;
 }
 denseRIV* fLexPull(FILE* lexWord){
 	denseRIV *output = calloc(1,sizeof(denseRIV));
-	size_t typeCheck;
+	int typeCheck;
 	/* get metadata for vector */
-	fread(&typeCheck, 1, sizeof(size_t), lexWord);
+	if(!fread(&typeCheck, 1, sizeof(int), lexWord)){
-	fread(&output->frequency, 1, sizeof(int), lexWord);
+		return NULL;
-	fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
+	}
-	fread(&output->magnitude, 1, sizeof(float), lexWord);
 	/* first value stored is the value count if sparse, and 0 if dense */
 	if (typeCheck){
 		/* pull as sparseVector */
-		sparseRIV temp;
+		sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
-		/* value was not 0, so it's the value count */
+		assert(&temp->count == IOstagingSlot);
-		temp.count = typeCheck;
+		temp->count = typeCheck;
+		temp->locations = IOstagingSlot+4;
+		temp->values = temp->locations+temp->count;		
+		if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
+			printf("vector read failure");
+			return NULL;
+		}
-		temp.locations = (int*)malloc(temp.count*2*sizeof(int));
+		/*sparseRIV temp;
+		temp.count = typeCheck;
+		temp.locations = malloc(temp.count*2*sizeof(int));
 		temp.values = temp.locations+temp.count;
+		fread(&output->frequency, 1, sizeof(int), lexWord);
+		fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
+		fread(&output->magnitude, 1, sizeof(float), lexWord);
 		fread(temp.locations, temp.count, sizeof(int), lexWord);
 		fread(temp.values, temp.count, sizeof(int), lexWord);
+*/
-		addS2D(output->values, temp);
+		addS2D(output->values, *temp);
-		free(temp.locations);
 	}else{
 		/* typecheck is thrown away, just a flag in this case */
-		fread(output->values, RIVSIZE, sizeof(int), lexWord);
+		//~ fread(&output->frequency, 1, sizeof(int), lexWord);
+		//~ fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
+		//~ fread(&output->magnitude, 1, sizeof(float), lexWord);
+		if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
+			printf("vector read failure");
+			return NULL;
+		}
 	}
@@ -254,7 +369,6 @@ denseRIV* fLexPull(FILE* lexWord){
 int cacheDump(){
 	int flag = 0;
 	for(int i = 0; i < CACHESIZE; i++){
 		if(RIVKey.RIVCache[i]){

--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread
+++ b/RIVread
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,6 +6,7 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
+#define CACHESIZE 100000
 #include "RIVtools.h"
 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -17,6 +18,7 @@ void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
 int main(int argc, char *argv[]){
 	char pathString[1000];
 	//we open the lexicon, if it does not yet exist, it will be created
@@ -69,7 +71,7 @@ void directoryGrind(char *rootString){
 			printf("skipped: %s\n", files->d_name); 
 			continue;
 		}
+		puts(files->d_name);
 		//open a file within root directory
 		FILE *input = fopen(pathString, "r");
 		if(input){
@@ -83,11 +85,11 @@ void directoryGrind(char *rootString){
 void fileGrind(FILE* textFile){
-	char textLine[5000];
+	char textLine[10000];
 	// included python script separates paragraphs into lines
+	int i=0;
-	while(fgets(textLine, 4999, textFile)){
+	while(fgets(textLine, 9999, textFile)){
+		printf("line: %d\n", i++);
 		if(!strlen(textLine)) continue;
 		if(feof(textFile)) break;
@@ -100,7 +102,11 @@ void fileGrind(FILE* textFile){
 void lineGrind(char* textLine){
 	//extract a context vector from this text set
 	sparseRIV contextVector = textToL2(textLine);
+	if(contextVector.contextSize <= 1){
+		free(contextVector.locations);
+		return;
+	}
 	denseRIV* lexiconRIV;
 	//identify stopping point in line read
 	char* textEnd = textLine + strlen(textLine)-1;
@@ -110,6 +116,7 @@ void lineGrind(char* textLine){
 		sscanf(textLine, "%99s%n", word, &displacement);
 		//we ensure that each word exists, and is free of unwanted characters
+		textLine += displacement+1;
 		if(!(*word))continue;
 		if(!isWordClean((char*)word)){
@@ -132,7 +139,7 @@ void lineGrind(char* textLine){
 		//and finally we push it back to the lexicon for permanent storage
 		lexPush(lexiconRIV);
-		textLine += displacement+1;
 	}
 	//free the heap allocated context vector data

--- a/RIVread.o
+++ b/RIVread.o
--- a/RIVtools.h.gch
+++ b/RIVtools.h.gch
--- a/graphdata.txt
+++ b/graphdata.txt
--- a/output.txt
+++ b/output.txt
--- a/test.py
+++ b/test.py
 import numpy as np
 import matplotlib.pyplot as plt
+import math  # NOTE(review): not referenced in the lines shown here
+def fit(x):  # reference curve the data is compared against: 1067 + 94500000/x
+    return 1*(1067+94500000/x)  # NOTE(review): "/" truncates under Python 2 when x is an int - confirm interpreter
+x = 7  # incremented once per plot below; not otherwise read in these lines
+range = 0.15  # NOTE(review): rebinds the builtin name `range`; replaced by input() before first use
+while(1):  # no exit condition: prompt, filter, plot, repeat until interrupted
+    range = input("gimmerange");  # tolerance as a fraction of fit(); NOTE(review): Python 3 input() returns str, so 1+range below would raise TypeError - confirm interpreter
+    data = open("graphdata.txt", "r");  # NOTE(review): never closed in the lines shown
+    frequencies = [];
+    mags = [];
+    fitline = [];
+    i = 0;
+    for line in data:
+        segments = line.split(",")  # comma-separated record; fields 1, 2 and 4 are used
+        freq = int(segments[1])
+        mag = float(segments[2])
+        name = segments[4];
+        if(freq>40000):  # skip records with freq above 40000
+             continue;
+        core = fit(freq)
+        fitmax = core*(1+range);
+        fitmin = core*(1-range);
+        if(mag >fitmax or mag < fitmin):  # keep only records whose mag lies within +/- range of the fit curve
+            continue
+        frequencies.append(freq)
+        mags.append(mag)
+	fitline.append(fit(freq));  # NOTE(review): this line and the next are tab-indented while their neighbours use spaces
+	print("{} {} {}".format(name, freq, mag))
+        i+=1
-data = open("../code/RIVet/graphdata.txt", "r");
-frequencies = [];
+    #plt.scatter(frequencies, mags)
-mags = [];
+    plt.plot(frequencies, fitline, 'r^', frequencies, mags, 'bs')  # fit values as red triangles, kept data as blue squares
-i = 0;
+    plt.show()
-for line in data:
+    x+=1
-    if(int(line.split(",")[1])>40000):
-         continue;
-    frequencies.append(int(line.split(",")[1]))
-    mags.append(float(line.split(",")[2]))
-    if(mags[i]>80 and frequencies[i]>7000 and frequencies[i]<15000):
-        print(line)
-    i+=1
-plt.scatter(frequencies, mags)
-plt.show()