updated lots of stuff

fe20c6f5 · etcart · 60856c1d · fe20c6f5 · fe20c6f5 · fe20c6f5
Commit fe20c6f5 authored May 09, 2018 by etcart
Showing with 1000 additions and 645 deletions
.stemnet2.txt.swp
.stemnete.txt.swp
RIVaccessories.h
RIVaccessories.h.gch
RIVclasses
RIVclasses.c
RIVclasses.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread.c
runscriptUb.sh
saturation.c
someshit.c
stemconfig/dbtools.py
stemconfig/dbtools.pyc
stemconfig/stemconf
stemconfig/stemconf.c
stemconfig/stemconf.o
stemconfig/stemconfig
stemconfig/stemconfig.c
--- a/.stemnet2.txt.swp
+++ b/.stemnet2.txt.swp
--- a/.stemnete.txt.swp
+++ b/.stemnete.txt.swp
--- a/RIVaccessories.h
+++ b/RIVaccessories.h
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "stemconfig/stemset.h"

 struct treenode{
 	void* data;
@@ -11,14 +12,14 @@ struct treenode{
 	struct treenode* links[26];
 	int downstream;
 	
-};
+}*nextNode;
+void stemInsert(struct treenode* node, char* letter, void* data);
 int treecut(struct treenode* node, char* letter);
-void stemInsert(struct treenode* node, char* letter, char* data);
-void RIVinsert(struct treenode* node, char* letter, void* data);
+
+void treeInsert(struct treenode* node, char* letter, void* data);
 void* treeSearch(struct treenode* node, char* letter);
 struct treenode* stemTreeSetup();

-
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
 	return seed;
 }
 struct treenode* stemTreeSetup(){
-	FILE* netfile = fopen("stemnet2.txt", "r");
-	if(!netfile){
-		printf("no stemnet file");
+	FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
+	if(!wordFile){
+		printf("no wordnet file");
 		return 0;
 	}
 	
-	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
+	nextNode = rootNode+1;
 	char word[100];
-	char stem[100];
-	
-	while(fscanf(netfile, "%s %s", word, stem)){
-	
-		if(feof(netfile)){
-			break;
-		}
+	char* stem = (char*)stemset;
+	int displacement;
+	while(fscanf(wordFile, "%s", word)){
+		
+		sscanf(stem, "%*s%n", &displacement);
+		stem[displacement] = '\0';
 		
-		stemInsert(rootNode, word, stem);
 		
+		stemInsert(rootNode, word, stem);
+		if(feof(wordFile)){
+			break;
+		}
+		stem += displacement+1;
 	}
+	fclose(wordFile);
 	return rootNode;
 }
+
+	
 void* treeSearch(struct treenode* node, char* letter){
 	
 	
@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
 		return node->data;
 	}
 }
-void RIVinsert(struct treenode* node, char* letter, void* data){
+void stemInsert(struct treenode* node, char* letter, void* data){
 	
 	node->downstream++;
 	if(*(letter)){
 		if(!node->links[*(letter)-'a']){
-			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
+			node->links[*(letter)-'a'] = nextNode++;
 			
 		}
-		RIVinsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 		
 	}else{
 		
@@ -119,44 +127,47 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
 		
 	}
 }
-void stemInsert(struct treenode* node, char* letter, char* data){
+void treeInsert(struct treenode* node, char* letter, void* data){
 	
 	node->downstream++;
 	if(*(letter)){
-		
 		if(!node->links[*(letter)-'a']){
 			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
 			
 		}
-		stemInsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 		
 	}else{
-		if(node->data) return;
-		node->data = calloc(strlen(data)+1, sizeof(char));
 		
+		if(node->data) return;
+		node->data = data;
 		
-		strcpy((char*)node->data, data);
+
 		
 	}
 }
+
 int treecut(struct treenode* node, char* letter){
 	node->downstream--;
 	int flag;
+	//continue searching downstream if there is a letter
 	if(*(letter)){
 		if(node->links[*(letter)-'a']){
-		
+			//propagate to next section
 			flag = treecut(node->links[*(letter)-'a'], letter+1);
+			//if next section returned a "cut" flag, 0 it out
 			if(flag){
 				node->links[*(letter)-'a'] = NULL;
 			}
 		}
-		if(!node->downstream){
-			
-			free(node);
-			return 1;
-		}
+	//there are no more letters, we've reached our destination
 	}else{
 		
+		node->data = NULL;
+	}
+	//this is on a branch that leads nowhere, free it and return "cut" flag
+	if(!node->downstream){
+			
 		free(node);
 		return 1;
 	}
@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
 	
 	
 }
+void destroyTree(struct treenode* node){
+	if(node->data) free(node->data);
+	for(int i=0; i<26; i++){
+		if(node->links[i]){
+			
+			destroyTree(node->links[i]);
+		}
+		
+	}
+	free(node);
+	
+}

 #endif
--- a/RIVaccessories.h.gch
+++ b/RIVaccessories.h.gch
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
 #include <stdio.h>
 #define RIVSIZE 50000
+#define CACHESIZE 20000
 #include "RIVtools.h"
-char* clean(char* word);
-char* stemmy(struct treenode* searchRoot, char* word);
-sparseRIV line2L3(char* text, struct treenode* searchRoot);
+#define k 5
 typedef char label[200];
 struct RIVclass{
 	label name;
 	sparseRIV* set;
 	int setSize;
 };
+char* clean(char* word);
+char* stemmy(struct treenode* searchRoot, char* word);
+sparseRIV line2L3(char* text, struct treenode* searchRoot);
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
+
+
 LEXICON* lexicon;
 int main(){
 	struct treenode* searchRoot = stemTreeSetup();
-	lexicon = lexOpen("consolidatedLexicon", "rx");
+	lexicon = lexOpen("lexiconEnron50-4", "rx");
 	
 	int classNo = 0;
 	
@@ -25,18 +30,38 @@ int main(){
 	
 	
 	
-	FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
+	FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
 	if(!textSet){
 		puts("no file");
 		return 1;
 	}
-	struct RIVclass* class;
+	struct RIVclass* class = 0;
 	char text[20000];
 	label className;
-	while(fscanf(textSet, "%s\t%s", text, className)){
+	//int j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		//if(j++>100) break;
+		if(feof(textSet)) break;
 		
-		char* labelTemp = strstr(*classNames, className);
-		if(!labelTemp){
+		
+		sparseRIV temp = line2L3(text, searchRoot);
+		temp.magnitude = getMagnitudeSparse(temp);
+		if(temp.magnitude == 0){
+			printf("%s, empty\n", text);
+			continue;
+		}
+		
+		
+		//printf("%s, %s", text, className);
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}
+		if(i == classCount){
 			/* reinitialize the classnames with a new member */
 			classNames = realloc(classNames, (classCount+1)*sizeof(label));
 			strcpy(classNames[classCount], className);
@@ -53,14 +78,10 @@ int main(){
 			
 			classNo = classCount;
 			classCount++;
-		}else{
-			classNo = (labelTemp-*classNames);
-			class = classes+classNo;
-		
 		}
 		
 		class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
-		sparseRIV thing= line2L3(text, searchRoot);
+		sparseRIV thing= temp;
 		class->set[class->setSize] = thing;
 		class->setSize++;
 		
@@ -69,10 +90,71 @@ int main(){
 	
 	for(int i=0; i<classCount; i++){
 		puts(classNames[i]);
+		puts(classes[i].name);
 		printf("%d\n\n", classes[i].setSize);
 	}
-	
-	
+	fclose(textSet);
+	textSet = fopen("../../Downloads/validationText.tsv", "r");
+	if(!textSet) return 1;
+	int won = 0;
+	int docTotal = 0;
+	//scanf("%d", &won);
+	//j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		if(feof(textSet)) break;
+		//if(j++>30) break;
+		
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}if(i == classCount){
+			printf("unclassifiable\n");
+			continue;
+		}
+		
+		
+		sparseRIV thing= line2L3(text, searchRoot);
+		if(thing.count ==0){
+			continue;
+		}
+		docTotal++;
+		denseRIV inQuestion = {0};
+		addS2D(inQuestion.values, thing);
+		inQuestion.magnitude = getMagnitudeDense(&inQuestion);
+		
+		double weights[classCount];
+		
+		int choice = kNearest(weights, classes, classCount, inQuestion);
+		if(choice == -1){
+			printf("classificationFailed");
+			
+		}else{
+			//puts(text);
+			printf("survey says! %s  ", className);
+			printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
+		}
+		if(choice == classNo){
+			won++;
+		}
+		free(thing.locations);
+	}
+	printf("\n\n we got %d/%d ", won, docTotal);
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			free(classes[i].set[j].locations);
+		}
+		free(classes[i].set);
+		
+	}
+	free(classes);
+	free(classNames);
+	destroyTree(searchRoot);
+	lexClose(lexicon);
+	fclose(textSet);
 	return 0;
 	
 }
@@ -132,26 +214,74 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
 				continue;
 			}else{
 				//printf("%s, succesfully pulled\n", stem);
-				temp = consolidateD2S(wordRIV->values);
-				
+				temp = normalize(*wordRIV, 10000);
+				//temp = consolidateD2S(wordRIV->values);
 				addS2D(accumulate.values, temp);
 				
 				
 				free(temp.locations);
-				free(wordRIV);
+				//free(wordRIV);
+				lexPush(lexicon, wordRIV);
 			}
 		}
 	}
 	temp = consolidateD2S(accumulate.values);
 	return temp;
-	
-
+}
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
+	int choice = -1;
+	memset(weights, 0, classCount*sizeof(double));
+	double distances[k] = {-2};

+	int labels[k] = {0};
+	int fill = 0;
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			double cosine = cosCompare(inQuestion, classes[i].set[j]);
+			
+			if(fill < k){
+				distances[fill] = cosine;
+				fill++;
+				continue;
+			}
+				
+			for(int x = 0; x<k; x++){
+				
+				
+				if(cosine>distances[x]){
+					distances[x] = cosine;
+					labels[x] = i;
+					
+					break;
+				}
+			}
+			
+		}
+	}
+	double totalweight = 0;
+	for(int i=0; i<classCount; i++){
+		for(int j = 0; j<k; j++){
+			if(labels[j] == i){
+				weights[i] += distances[j];
+				totalweight += distances[j];
+			}
+			
+		}
+	}
 	
+	
+	double tempmax = -2;
+	for(int i=0; i<classCount; i++){
+		weights[i] /= totalweight;
+		if(weights[i] > tempmax){
+			choice = i;
+			tempmax = weights[i];
+		}
+	}
+	return choice;
 }
 		
 		
-		
 		 
 	
 	

--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
-#ifndef RIV_LEXICON_H
-#define RIV_LEXICON_H
-
-#include "RIVLower.h"
-#include "RIVaccessories.h"
-#include "assert.h"
-
-#ifndef READFLAG
-#define READFLAG 0x01
-#endif
-
-#ifndef WRITEFLAG
-#define WRITEFLAG 0x02
-#endif
-
-#ifndef INCFLAG 
-#define INCFLAG 0x04
-#endif
-
-#ifndef CACHEFLAG
-#define CACHEFLAG 0x08
-#endif
-
-#ifndef SORTCACHE
-	#ifndef HASHCACHE
-		#define SORTCACHE
-	#endif
-#endif
-typedef struct{
-	char lexName[100];
-	denseRIV* *cache;
-	struct cacheList* listPoint;
-	char flags;
-	#ifdef SORTCACHE
-	struct treenode* treeRoot;
-	#endif /* SORTCACHE */
-}LEXICON;
-struct cacheList{
-	denseRIV* *cache;
-	struct cacheList* next;
-	struct cacheList* prev;
-}*rootCache = NULL;
-
-#define IODISPLACEMENT   (sizeof(((sparseRIV*)0)->count)\
-						+ sizeof(((sparseRIV*)0)->frequency)\
-						+ sizeof(((sparseRIV*)0)->contextSize)\
-						+ sizeof(((sparseRIV*)0)->magnitude))\
-						/ sizeof(int)
-int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
-
-
-/* lexOpen is called to "open the lexicon", setting up for later calls to
- * lexPush and lexPull. if the lexicon has not been opened before calls
- * to these functions, their behavior can be unpredictable, most likely crashing
- */
-LEXICON* lexOpen(const char* lexName, const char* flags);
-
-/* lexClose should always be called after the last lex push or lex pull call
- * if the lexicon is left open, some vector data may be lost due to 
- * un-flushed RIV cache
- */
-void lexClose(LEXICON*);
-
-
-/* both lexPush and lexPull must be called *after* the lexOpen() function
- * and after using them the lexClose() function must be called to ensure
- * data security */
- 
-/* lexPush writes a denseRIV to the lexicon for permanent storage */
-int lexPush(LEXICON* lexicon, denseRIV* RIVout);
-
-/* cacheCheckOnPush tests the state of this vector in our lexicon cache
- * and returns 1 on "success" indicating cache storage and no need to push to file
- * or returns 0 on "failure" indicating that the vector need be pushed to file 
- */
-int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
-
-/* lexPull reads a denseRIV from the lexicon, under "word"
- * if the file does not exist, it creates a 0 vector with the name of word
- * lexPull returns a denseRIV *pointer* because its data must be tracked 
- * globally for key optimizations
- */
-denseRIV* lexPull(LEXICON* lexicon, char* word);
-
-/* cacheCheckonPull checks if the word's vector is stored in cache,
- * and returns a pointer to that vector on success
- * or returns a NULL pointer if the word is not cached, indicating a need 
- * to pull from file
- */
-denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
-
-/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
- * saving it for long-term aggregation.  function is called by "lexPush",
- * which is what users should actually use.  lexPush, unlike fLexPush,
- * has cache logic under the hood for speed and harddrive optimization
- */
-int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
-
-/* flexPull pulls data directly from a file and converts it (if necessary)
- * to a denseRIV.  function is called by "lexPull" which is what users 
- * should actually use.  lexPull, unlike FlexPull, has cache logic under
- * the hood for speed and harddrive optimization 
- */
-denseRIV* fLexPull(FILE* lexWord);
-
-/* redefines signal behavior to protect cached data against seg-faults etc*/
-void signalSecure(int signum, siginfo_t *si, void* arg);
-
-/* used exclusively by flexpush to determine write-style (sparse or dense)
- * and also formats the "IOstagingSlot" for fwrite as a single block if sparse
- */
-int saturationForStaging(denseRIV* output);
-/* begin definitions */
-LEXICON* lexOpen(const char* lexName, const char* flags){
-	LEXICON* output = calloc(1, sizeof(LEXICON));
-	/* identify the presence of read, write, and exclusive flags */
-	char* r = strstr(flags, "r");
-	char* w = strstr(flags, "w");
-	char* x = strstr(flags, "x");
-	struct stat st = {0};
-	
-	
-	if(w){
-		/* if set to write, we check and create if necessary, the lexicon */
-		if (stat(lexName, &st) == -1) {
-			mkdir(lexName, 0777);
-		}	
-		output->flags |= WRITEFLAG;
-	}else if(r){
-		/* if set to read and not write, return null if lexicon does not exist */
-		if (stat(lexName, &st) == -1) {
-			free(output);
-			return NULL;
-		}	
-		output->flags |= READFLAG;
-	}
-		/* if not set to exclusive, set the inclusive flag */
-	if(!x){
-		output->flags |= INCFLAG;
-	}
-	strcpy(output->lexName, lexName);
-	
-	
-	#if CACHESIZE > 0
-	
-	if(r && w){
-		//#TODO include hash vs sort cache logic flags
-		/* if we will be reading and writing the same lexicon, setup a
-		 * cache for this lexicon to speed up rewrites */
-		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
-		#ifdef HASHCACHE
-		newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
-		#else
-		#ifdef SORTCACHE
-		newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*));
-		output->treeRoot = calloc(1, sizeof(struct treenode));
-		#endif
-		#endif
-		output->flags |= CACHEFLAG;
-		
-		output->cache = newCache->cache;
-		newCache->next = rootCache;
-		if(rootCache){
-			rootCache->prev = newCache;
-		}
-		rootCache = newCache;
-		output->listPoint = newCache;
-		
-		struct sigaction action = {0};
-		action.sa_sigaction = signalSecure;
-		action.sa_flags = SA_SIGINFO;
-		
-		for(int i=1; i<27; i++){
-			sigaction(i,&action,NULL);
-		}
-	}
-	#endif
-
-	return output;
-}
-void lexClose(LEXICON* toClose){
-	
-#if CACHESIZE>0 
-	if(toClose->flags & CACHEFLAG){
-		if(cacheDump(toClose->cache)){
-			puts("cache dump failed, some lexicon data was lost");
-		}
-		struct cacheList* listPoint = toClose->listPoint;
-		if(listPoint->prev){
-			listPoint->prev->next = toClose->listPoint->next;
-		}
-		if(listPoint->next){
-			listPoint->next->prev = toClose->listPoint->prev;
-		}
-		free(listPoint);
-	}
-#endif
-	free(toClose);
-}
-
-
-
-#if CACHESIZE > 0
-denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
-	#ifdef HASHCACHE
-	srand(wordtoSeed(word));
-	int hash = rand()%CACHESIZE;
-	if(lexicon->cache[hash]){
-		if(!strcmp(word, lexicon->cache[hash]->name)){
-			/* if word is cached, pull from cache and exit */
-			return lexicon->cache[hash];
-		}
-	}
-	return NULL;
-	#endif
-	#ifdef SORTCACHE
-	
-	return treeSearch(lexicon->treeRoot, word);
-
-	#endif
-}
-
-int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
-	
-	/* if our RIV was cached already, no need to play with it */
-	if(RIVout->cached == lexicon){
-		return 1;
-	}
-	#ifdef HASHCACHE
-	srand(wordtoSeed(RIVout->name));
-	int hash = rand()%CACHESIZE;
-	
-	/* if there is no word in this cache slot */
-	if(!lexicon->cache[hash]){
-		/* push to cache instead of file */
-		lexicon->cache[hash] = RIVout;
-		lexicon->cache[hash]->cached = lexicon;
-		return 1;
-	/*if the current RIV is more frequent than the RIV holding its slot */
-	}
-	if(RIVout->frequency > lexicon->cache[hash]->frequency ){
-		/* push the lower frequency cache entry to a file */
-		fLexPush(lexicon, lexicon->cache[hash]);
-		/* replace this cache-slot with the current vector */
-
-		lexicon->cache[hash] = RIVout;
-		lexicon->cache[hash]->cached = lexicon;
-		
-		return 1;
-	}
-	return 0;
-	#endif /* HASHCACHE */
-	#ifdef SORTCACHE
-	denseRIV* *cache_slider = lexicon->cache;
-	while(*cache_slider){
-		if(RIVout->frequency > (*cache_slider)->frequency){
-			memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
-			if(lexicon->cache[CACHESIZE]){
-				
-				fLexPush(lexicon, lexicon->cache[CACHESIZE]);
-				//remove tree element
-				treecut(lexicon->treeRoot, RIVout->name);
-				lexicon->cache[CACHESIZE] = NULL;
-			}
-			RIVout->cached = lexicon;
-			*cache_slider = RIVout;
-			//add tree element
-			RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
-			
-			return 1;
-		}
-			
-		cache_slider++;
-	}
-	if(cache_slider-lexicon->cache < CACHESIZE){
-		RIVout->cached = lexicon;
-		*cache_slider = RIVout;
-		RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
-		//add tree element
-		return 1;
-	}
-	return 0;
-	#endif /* SORTCACHE */
-}
-
-#endif
-denseRIV* lexPull(LEXICON* lexicon, char* word){
-	
-	denseRIV* output = NULL;
-	
-	#if CACHESIZE > 0
-	if(lexicon->flags & CACHEFLAG){
-		/* if there is a cache, first check if the word is cached */
-		if((output = cacheCheckOnPull(lexicon, word))){
-			return output;
-		}
-	}
-	#endif /* CACHESIZE > 0 */
-
-	/* if not, attempt to pull the word data from lexicon file */
-	char pathString[200];
-
-	sprintf(pathString, "%s/%s", lexicon->lexName, word);
-
-	FILE *lexWord = fopen(pathString, "rb");
-
-	/* if this lexicon file already exists */
-	if(lexWord){
-		/* pull data from file */
-		
-		output = fLexPull(lexWord);
-		strcpy(output->name, word);
-		fclose(lexWord);
-	}else{
-		/* if lexicon is set to inclusive (can gain new words) */
-		if(lexicon->flags & INCFLAG){
-			
-			/*if file does not exist, return a 0 vector (word is new to the lexicon) */
-			output = calloc(1, sizeof(denseRIV));
-			strcpy(output->name, word);
-		}
-		/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
-	}
-
-	
-
-	return output;
-}
-
-int lexPush(LEXICON* lexicon, denseRIV* RIVout){
-	
-	#if CACHESIZE > 0
-	if(lexicon->flags & CACHEFLAG){
-	/* check the cache to see if it belongs in cache */
-		if(cacheCheckOnPush(lexicon, RIVout)){
-			/* if the cache check returns 1, it has been dealth with in cache */
-			return 0;
-		}
-	}
-	
-	#endif
-	
-	
-	/* push to the lexicon */
-	return fLexPush(lexicon, RIVout);
-	
-}
-
-int saturationForStaging(denseRIV* output){
-	
-	/* key/value pairs will be loaded to a worst-case sized temporary slot */
-
-	int* count = IOstagingSlot;
-	*count = 0;
-	*(count+1) = 0;
-	*(count+2) = output->frequency;
-	*(count+3) = output->contextSize;
-	*(float*)(count+4) = output->magnitude;
-	
-	int* locations = IOstagingSlot+5;
-	int* values = IOstagingSlot-RIVSIZE;;
-	int* locations_slider = locations;
-	int* values_slider = values;
-	for(int i=0; i<RIVSIZE; i++){
-		
-		/* act only on non-zeros */
-		if(output->values[i]){
-			
-			/* assign index to locations */
-			*(locations_slider++) = i;
-			
-			/* assign value to values */
-			*(values_slider++) = output->values[i];
-			
-			/* track size of forming sparseRIV */
-			*count += 1;
-		}
-	}
-		
-	/* copy values into slot immediately after locations */
-	memcpy(locations+*count, values, (*count)*sizeof(int));
-	
-	return *count;
-}
-int fLexPush(LEXICON* lexicon, denseRIV* output){	
-	char pathString[200] = {0};
-	
-	/* word data will be placed in a (new?) file under the lexicon directory
-	 * in a file named after the word itself */
-	sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
-	
-	int saturation = saturationForStaging(output);
-	
-	if( saturation < RIVSIZE/2){
-		
-		FILE *lexWord = fopen(pathString, "wb");
-		if(!lexWord){
-			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
-			return 1;
-		}
-		fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
-		fclose(lexWord);
-	}else{
-		output->cached = 0;
-		FILE *lexWord = fopen(pathString, "wb");
-		if(!lexWord){
-			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
-			return 1;
-		}
-		fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
-		
-		fclose(lexWord);
-	}
-	
-	free(output);
-
-	return 0;
-}
-
-denseRIV* fLexPull(FILE* lexWord){
-	denseRIV *output = calloc(1,sizeof(denseRIV));
-	size_t typeCheck;
-	/* get metadata for vector */
-	if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
-		return NULL;
-	}
-	int flag = 0;
-	/* first value stored is the value count if sparse, and 0 if dense */
-	if (typeCheck){
-		/* pull as sparseVector */
-		/*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
-		
-		temp->count = typeCheck;
-		temp->locations = IOstagingSlot+5;
-		temp->values = temp->locations+temp->count;		
-		
-		if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
-			printf("vector read failure");
-			return NULL;
-		}*/
-	
-		sparseRIV temp;
-		temp.count = typeCheck;
-		temp.locations = malloc(temp.count*2*sizeof(int));
-		temp.values = temp.locations+temp.count;
-		flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
-		flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
-		flag+= fread(&output->magnitude, 1, sizeof(float), lexWord);
-		flag += fread(temp.locations, temp.count, sizeof(int), lexWord);
-		flag+= fread(temp.values, temp.count, sizeof(int), lexWord);
-
-
-		addS2D(output->values, temp);
-	}else{
-		/* typecheck is thrown away, just a flag in this case */
-		flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
-		flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
-		flag +=fread(&output->magnitude, 1, sizeof(float), lexWord);
-		/*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
-			printf("vector read failure");
-			return NULL;
-		}*/
-	}
-
-
-	output->cached = 0;
-
-	return output;
-
-}
-
-void signalSecure(int signum, siginfo_t *si, void* arg){
-	while(rootCache){
-		if(cacheDump(rootCache->cache)){
-			puts("cache dump failed, some lexicon data lost");
-		}
-		rootCache = rootCache->next;
-		
-	}
-	signal(signum, SIG_DFL);
-	kill(getpid(), signum);
-}
-int cacheDump(denseRIV* *toDump){
-	
-	int flag = 0;
-	denseRIV* *toDump_slider = toDump;
-	#ifdef HASHCACHE
-	denseRIV* *toDump_stop = toDump+CACHESIZE;
-	while(toDump_slider<toDump_stop){
-		if(*toDump_slider){
-
-			flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
-		}
-		toDump_slider++;
-	}
-	#else
-	#ifdef SORTCACHE
-	while(*toDump_slider){
-		flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
-		
-		toDump_slider++;
-	
-	}
-	#endif
-	#endif
-	free(toDump);
-	
-	return flag;
-}
-#endif
+#ifndef RIV_LEXICON_H
+#define RIV_LEXICON_H
+
+#include "RIVLower.h"
+#include "RIVaccessories.h"
+#include "assert.h"
+
+/* these flags will be used by the lexicon to know its permissions and states */
+#ifndef READFLAG
+#define READFLAG 0x01
+#endif
+
+#ifndef WRITEFLAG
+#define WRITEFLAG 0x02
+#endif
+
+#ifndef INCFLAG 
+#define INCFLAG 0x04
+#endif
+
+#ifndef CACHEFLAG
+#define CACHEFLAG 0x08
+#endif
+
+/* if user has specified neither hashed nor sorted cache we assume sorted.
+ * the hashed strategy is extremely CPU and memory light, but very ineffective
+ * at ensuring the most important vectors are cached. as such it is better
+ * optimized for RAMdisks and unusually fast SSDs.  the sorted strategy
+ * is much more expensive for the CPU, but keeps hard-drive reads and
+ * writes to a minimum far more effectively */
+
+#ifndef SORTCACHE
+	#ifndef HASHCACHE
+		#define SORTCACHE
+	#endif
+#endif
+/* the LEXICON struct will be used similar to a FILE (as a pointer) which
+ * contains all metadata that a lexicon needs in order to be read and written to safely*/
+typedef struct{
+	char lexName[100];
+	denseRIV* *cache;
+	struct cacheList* listPoint;
+	char flags;
+	#ifdef SORTCACHE
+	/* if our cache is sorted, we will need a search tree and a saturation */
+	struct treenode* treeRoot;
+	int cacheSaturation;
+	denseRIV* *cache_slider;
+	#endif /* SORTCACHE */
+}LEXICON;
+/* this will form a linked list of caches, so that all data can be safely dumped
+ * in event of an error, no matter how many or how strangely lexica have
+ * been opened and closed */
+struct cacheList{
+	denseRIV* *cache;
+	struct cacheList* next;
+	struct cacheList* prev;
+}*rootCache = NULL;
+
+/* IOstagingSlot is used by fLexPush to preformat data to be written in a single
+ * fwrite() call.  it has room for RIVSIZE integers behind it and 2*RIVSIZE
+ * integers ahead of it, which the function saturationForStaging() will need */
+int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE;
+
+/* lexOpen is called to "open the lexicon", setting up for later calls to
+ * lexPush and lexPull. if the lexicon has not been opened before calls
+ * to these functions, their behavior can be unpredictable, most likely crashing
+ * lexOpen accepts flags: r, w, x.
+ * r: for reading, currently meaningless, it won't stop you reading if you don't have this
+ * w: for writing. if a readonly lexicon is "written to" no data will be saved in hardcopy
+ * although it will be cached if possible, so that later pulls will be optimized
+ * x: exclusive. will not accept new words, lexPull returns a NULL pointer
+ * and lexPush simply frees any word which is not already in the lexicon
+ */
+LEXICON* lexOpen(const char* lexName, const char* flags);
+
+/* lexClose should always be called after the last lex push or lex pull call
+ * if the lexicon is left open, some vector data may be lost due to 
+ * un-flushed RIV cache.  also frees up data, memory leaks if lexicon is not closed
+ */
+void lexClose(LEXICON*);
+
+/* both lexPush and lexPull must be called *after* the lexOpen() function
+ * and after using them the lexClose() function must be called to ensure
+ * data security (only after the final push or pull, not regularly during operation) */
+ 
+/* lexPush writes a denseRIV to the lexicon for permanent storage */
+int lexPush(LEXICON* lexicon, denseRIV* RIVout);
+
+/* lexPull reads a denseRIV from the lexicon, under "word"
+ * if the file does not exist, it creates a 0 vector with the name of word
+ * lexPull returns a denseRIV *pointer* because its data must be tracked 
+ * globally for key optimizations
+ */
+denseRIV* lexPull(LEXICON* lexicon, char* word);
+
+/* cacheCheckOnPush tests the state of this vector in our lexicon cache
+ * and returns 1 on "success" indicating cache storage and no need to push to file
+ * or returns 0 on "failure" indicating that the vector need be pushed to file 
+ */
+int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
+
+/* cacheCheckOnPull checks if the word's vector is stored in cache,
+ * and returns a pointer to that vector on success
+ * or returns a NULL pointer if the word is not cached, indicating a need 
+ * to pull from file
+ */
+denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
+
+/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
+ * saving it for long-term aggregation.  function is called by "lexPush",
+ * which is what users should actually use.  lexPush, unlike fLexPush,
+ * has cache logic under the hood for speed and harddrive optimization
+ */
+int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
+
+/* fLexPull pulls data directly from a file and outputs it as a denseRIV.
+ * function is called by "lexPull" which is what users 
+ * should actually use.  lexPull, unlike fLexPull, has cache logic under
+ * the hood for speed and harddrive optimization 
+ */
+denseRIV* fLexPull(FILE* lexWord);
+
+/* redefines signal behavior to protect cached data against seg-faults etc*/
+void signalSecure(int signum, siginfo_t *si, void* arg);
+int cacheDump(denseRIV* *toDump);
+
+/* used exclusively by fLexPush to determine write-style (sparse or dense)
+ * and also formats the "IOstagingSlot" for fwrite as a single block if sparse
+ */
+int saturationForStaging(denseRIV* output);
+/* begin definitions */
+LEXICON* lexOpen(const char* lexName, const char* flags){
+	LEXICON* output = calloc(1, sizeof(LEXICON));
+	/* identify the presence of read, write, and exclusive flags */
+	char* r = strstr(flags, "r");
+	char* w = strstr(flags, "w");
+	char* x = strstr(flags, "x");
+	struct stat st = {0};
+	
+	
+	if(w){
+		/* if set to write, we check and create if necessary, the lexicon */
+		if (stat(lexName, &st) == -1) {
+			mkdir(lexName, 0777);
+		}
+		/* flag for writing*/
+		output->flags |= WRITEFLAG;
+	}else if(r){
+		/* if set to read and not write, return null if lexicon does not exist */
+		if (stat(lexName, &st) == -1) {
+			free(output);
+			return NULL;
+		}	
+		/* flag for reading */
+		output->flags |= READFLAG;
+	}
+		/* if not set to exclusive, set the inclusive flag */
+	if(!x){
+		/* flag inclusive (will return unknown words as 0 vector */
+		output->flags |= INCFLAG;
+	}
+	/* record the name of the lexicon */
+	strcpy(output->lexName, lexName);
+	
+	#if CACHESIZE > 0
+	output->cache = calloc(CACHESIZE, sizeof(denseRIV*));
+
+
+	#ifdef SORTCACHE
+	/* a sorted cache needs a search tree for finding RIVs by name */
+	output->treeRoot = calloc(1, sizeof(struct treenode));
+	output->cacheSaturation = 0;
+	output->cache_slider = output->cache+CACHESIZE;
+	#endif /* SORTCACHE */
+	
+	/* flag cached ?? */ 
+	output->flags |= CACHEFLAG;
+	if(w){
+		/* setup cache-list element for break dumping */
+		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
+		newCache->cache = output->cache;
+		
+		newCache->next = rootCache;
+		if(rootCache){
+			rootCache->prev = newCache;
+		}
+		rootCache = newCache;
+		output->listPoint = newCache;
+
+		struct sigaction action = {0};
+		action.sa_sigaction = signalSecure;
+		action.sa_flags = SA_SIGINFO;
+		
+		for(int i=1; i<27; i++){
+			sigaction(i,&action,NULL);
+		}
+	}
+
+	#endif /* CACHESIZE > 0 */
+
+	return output;
+}
+/* lexClose: release a LEXICON handle.
+ * toClose: handle to close; a NULL handle is ignored.
+ * if the lexicon carries WRITEFLAG (and caching is compiled in), the cache is
+ * dumped to file first and this lexicon's node is removed from the global
+ * cache list that signalSecure walks.
+ * NOTE(review): when WRITEFLAG is not set, toClose->cache and any vectors
+ * still held in it are not released here -- confirm whether that is intended */
+void lexClose(LEXICON* toClose){
+
+	if(!toClose){
+		return;
+	}
+#if CACHESIZE>0 
+	if(toClose->flags & WRITEFLAG){
+		/* cacheDump pushes each cached vector to file and frees the cache array */
+		if(cacheDump(toClose->cache)){
+			puts("cache dump failed, some lexicon data was lost");
+		}
+		struct cacheList* listPoint = toClose->listPoint;
+		if(listPoint->prev){
+			listPoint->prev->next = listPoint->next;
+		}else if(rootCache == listPoint){
+			/* closing the head of the list: move the root forward so that
+			 * signalSecure never follows a pointer to freed memory */
+			rootCache = listPoint->next;
+		}
+		if(listPoint->next){
+			listPoint->next->prev = listPoint->prev;
+		}
+		free(listPoint);
+	}
+#endif
+	free(toClose);
+}
+
+
+
+#if CACHESIZE > 0
+/* cacheCheckOnPull: look for `word` in the lexicon's in-memory cache.
+ * returns the cached denseRIV* on a hit, NULL on a miss.
+ * the lookup strategy is chosen at compile time (HASHCACHE or SORTCACHE);
+ * with neither defined every lookup is reported as a miss */
+denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
+	#if defined(HASHCACHE)
+	/* the slot for a word is derived by seeding rand() with the word's seed;
+	 * note that this reseeds the process-wide generator on every call */
+	srand(wordtoSeed(word));
+	int hash = rand()%CACHESIZE;
+	denseRIV* candidate = lexicon->cache[hash];
+	/* the slot may be empty or hold a different word that hashed alike */
+	if(candidate && !strcmp(word, candidate->name)){
+		return candidate;
+	}
+	return NULL;
+	#elif defined(SORTCACHE)
+	/* use a treeSearch (found in RIVaccessories) to find the denseRIV* in the cache */
+	return treeSearch(lexicon->treeRoot, word);
+	#else
+	/* no cache strategy compiled in: never fall off the end of a non-void function */
+	(void)lexicon;
+	(void)word;
+	return NULL;
+	#endif
+}
+
+/* cacheCheckOnPush: decide whether RIVout is kept in the lexicon's cache.
+ * returns 1 when the vector is (now) held by the cache, in which case the
+ * caller must not write or free it; returns 0 when the cache declined it.
+ * a vector evicted to make room is handed to fLexPush, which writes it to
+ * its lexicon file and frees it */
+int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
+	
+	/* if our RIV was cached already, no need to play with it */
+	if(RIVout->cached == lexicon){
+		/* return "success" the vector is already in cache and updated */
+		return 1;
+	}
+	#ifdef HASHCACHE
+	/* slot is derived from the word; this reseeds the process-wide rand() */
+	srand(wordtoSeed(RIVout->name));
+	int hash = rand()%CACHESIZE;
+	
+	/* if there is no word in this cache slot */
+	if(!lexicon->cache[hash]){
+		/* push to cache instead of file */
+		lexicon->cache[hash] = RIVout;
+		lexicon->cache[hash]->cached = lexicon;
+		/* return "success" */
+		return 1;
+	}
+	/* if the current RIV is more frequent than the RIV holding its slot */
+	if(RIVout->frequency > lexicon->cache[hash]->frequency ){
+		/* push the lower frequency cache entry to a file */
+		/* NOTE(review): this eviction writes to file without checking
+		 * WRITEFLAG, and its return value is ignored -- confirm intended */
+		fLexPush(lexicon, lexicon->cache[hash]);
+		/* replace this cache-slot with the current vector */
+
+		lexicon->cache[hash] = RIVout;
+		lexicon->cache[hash]->cached = lexicon;
+		/* return "success" */
+		return 1;
+	}
+	/* slot is held by a vector at least as frequent: not cached */
+	return 0;
+	#endif /* HASHCACHE */
+	#ifdef SORTCACHE
+	
+	/* if the cache is not yet full, append this vector to the accumulating list */
+	if (lexicon->cacheSaturation < CACHESIZE){
+		RIVout->cached = lexicon;
+		lexicon->cache[lexicon->cacheSaturation] = RIVout;
+		/* index the vector by name so cacheCheckOnPull can find it */
+		treeInsert(lexicon->treeRoot, RIVout->name, RIVout);	
+		
+		
+		lexicon->cacheSaturation = lexicon->cacheSaturation+1;
+		/* return "success" */
+		return 1;
+	}else{ /* if cache is full */
+		
+		/* NOTE(review): cached is set before we know whether the vector is
+		 * accepted; it stays set on the "return 0" path below */
+		RIVout->cached = lexicon;
+		/* toCheck is the vector currently looking for a slot */
+		denseRIV* toCheck = RIVout;
+		denseRIV* temp;
+		
+		while(1){
+			/* the slider walks the cache backwards, wrapping from the
+			 * first slot to one past the last */
+			if(lexicon->cache_slider == lexicon->cache){
+				lexicon->cache_slider += CACHESIZE;
+			}		
+			(lexicon->cache_slider)--;
+			if(toCheck->frequency > (*lexicon->cache_slider)->frequency){
+				/* toCheck takes this slot; the displaced vector becomes
+				 * the one looking for a slot */
+				temp = (*lexicon->cache_slider);
+				(*lexicon->cache_slider) = toCheck;
+				toCheck = temp;
+			}else{
+				if(toCheck == RIVout){
+					/* the very first slot examined was at least as
+					 * frequent: RIVout was never placed */
+					return 0;
+				}else{
+					/* toCheck is the vector that lost its slot: drop it
+					 * from the search tree, write it out (fLexPush also
+					 * frees it), and index the newly cached RIVout */
+					treecut(lexicon->treeRoot, toCheck->name);
+					fLexPush(lexicon, toCheck);
+					treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
+					return 1;
+				}
+				/* not reached: both branches above return */
+				break;
+			}			
+		}
+	}
+	/* return "failure" (not reached: every path above returns) */
+	return 0;
+	
+	#endif /* SORTCACHE */
+}
+
+#endif
+/* lexPull: fetch the vector for `word` from a lexicon.
+ * lookup order: the cache (when compiled in and flagged), then the file
+ * "<lexName>/<word>".
+ * returns:
+ *   - the cached vector on a cache hit
+ *   - a freshly allocated vector read from file, named after the word
+ *   - a freshly allocated zero vector when no file exists and the lexicon
+ *     is inclusive (INCFLAG)
+ *   - NULL when no file exists and the lexicon is exclusive, when the file
+ *     cannot be read, when the path does not fit the path buffer, or when
+ *     allocation fails */
+denseRIV* lexPull(LEXICON* lexicon, char* word){
+
+	#if CACHESIZE > 0
+	if(lexicon->flags & CACHEFLAG){
+		/* if there is a cache, first check if the word is cached */
+		denseRIV* cached = cacheCheckOnPull(lexicon, word);
+		if(cached){
+			return cached;
+		}
+	}
+	#endif /* CACHESIZE > 0 */
+
+	/* if not, attempt to pull the word data from lexicon file */
+	char pathString[200];
+	int pathLen = snprintf(pathString, sizeof pathString, "%s/%s", lexicon->lexName, word);
+	if(pathLen < 0 || (size_t)pathLen >= sizeof pathString){
+		/* a truncated path would name the wrong file; refuse instead */
+		return NULL;
+	}
+
+	FILE *lexWord = fopen(pathString, "rb");
+
+	/* if this lexicon file already exists */
+	if(lexWord){
+		denseRIV* output = fLexPull(lexWord);
+		/* close on every path, including a failed read */
+		fclose(lexWord);
+		if(!output){
+			return NULL;
+		}
+		/* record the "name" of the vector, as the word */
+		strcpy(output->name, word);
+		return output;
+	}
+
+	/* if lexicon is set to exclusive, unknown words yield NULL, not a 0 vector */
+	if(!(lexicon->flags & INCFLAG)){
+		return NULL;
+	}
+
+	/* inclusive lexicon: the word is new, hand back a 0 vector */
+	denseRIV* output = calloc(1, sizeof(denseRIV));
+	if(!output){
+		return NULL;
+	}
+	strcpy(output->name, word);
+	return output;
+}
+
+/* lexPush: hand a vector back to the lexicon.
+ * the cache (when compiled in and flagged) gets first refusal; otherwise the
+ * vector is written through fLexPush if the lexicon carries WRITEFLAG, or
+ * simply freed if it does not.
+ * returns 0 when the vector was cached or discarded, else fLexPush's result */
+int lexPush(LEXICON* lexicon, denseRIV* RIVout){
+
+	#if CACHESIZE > 0
+	/* a non-zero cache check means the cache has taken the vector */
+	if((lexicon->flags & CACHEFLAG) && cacheCheckOnPush(lexicon, RIVout)){
+		return 0;
+	}
+	#endif
+
+	/* without the write flag there is nowhere to put it: free and return */
+	if(!(lexicon->flags & WRITEFLAG)){
+		free(RIVout);
+		return 0;
+	}
+
+	/* push to the lexicon */
+	return fLexPush(lexicon, RIVout);
+}
+
+/* saturationForStaging: lay a dense vector into IOstagingSlot in sparse form.
+ * resulting layout, in ints from the start of IOstagingSlot:
+ *   [0]      number of non-zero values
+ *   [1]      0 (the count occupies an 8 byte slot)
+ *   [2]      frequency
+ *   [3]      contextSize
+ *   [4]      magnitude (stored as float)
+ *   [5...]   indices of the non-zeros, followed directly by their values
+ * returns the number of non-zero values */
+int saturationForStaging(denseRIV* output){
+
+	int* header = IOstagingSlot;
+	header[0] = 0;
+	header[1] = 0;
+	header[2] = output->frequency;
+	header[3] = output->contextSize;
+	/* TODO fix this to allow magnitude to be changed to double easily */
+	*(float*)(header+4) = output->magnitude;
+
+	/* indices go directly behind the metadata */
+	int* locations = IOstagingSlot+5;
+	/* values are gathered in the region *before* the slot, then copied to
+	 * sit behind the indices once we know how many there are */
+	int* values = IOstagingSlot-RIVSIZE;
+
+	int filled = 0;
+	for(int i=0; i<RIVSIZE; i++){
+		/* zeros are not stored */
+		if(!output->values[i]){
+			continue;
+		}
+		locations[filled] = i;
+		values[filled] = output->values[i];
+		filled++;
+	}
+	header[0] = filled;
+
+	/* copy values into slot immediately after locations */
+	memcpy(locations+filled, values, filled*sizeof(int));
+
+	/* return number of non-zeros */
+	return filled;
+}
+/* fLexPush: write a vector to the file "<lexName>/<name>" and free it.
+ * vectors less than half full are written in sparse form from IOstagingSlot,
+ * fuller ones are written densely straight out of the struct.
+ * returns 0 on success, in which case `output` has been freed.
+ * returns 1 on failure (path too long, file could not be opened, short write
+ * or failed close); `output` is then NOT freed and remains the caller's.
+ * NOTE(review): in the dense case `cached` has already been zeroed when a
+ * write failure is reported */
+int fLexPush(LEXICON* lexicon, denseRIV* output){	
+	char pathString[200] = {0};
+
+	/* word data will be placed in a (new?) file under the lexicon directory
+	 * in a file named after the word itself */
+	int pathLen = snprintf(pathString, sizeof pathString, "%s/%s", lexicon->lexName, output->name);
+	if(pathLen < 0 || (size_t)pathLen >= sizeof pathString){
+		fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
+		return 1;
+	}
+
+	/* saturationForStaging returns the number of non-zero elements in the vector
+	 * and, in the process, places the data of the vector, in sparse format, in the
+	 * preallocated "IOstagingSlot" */
+	int saturation = saturationForStaging(output);
+
+	FILE *lexWord = fopen(pathString, "wb");
+	if(!lexWord){
+		fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
+		return 1;
+	}
+
+	size_t wanted;
+	size_t wrote;
+	/* if our vector is less than half full, it is lighter to save it as a sparseRIV */
+	if(saturation < RIVSIZE/2){
+		/* IOstagingSlot is formatted for immediate writing:
+		 * 5 ints of metadata, then one index and one value per non-zero */
+		wanted = ((size_t)saturation*2)+5;
+		wrote = fwrite(IOstagingSlot, sizeof(int), wanted, lexWord);
+	}else{
+		/* the "cached" datapoint will be erased, a typecheck flag (0) for
+		 * the fLexPull function to know that this is a denseVector put 
+		 * in its place */
+		output->cached = 0;
+		/* from the type flag forward, all metadata is preformatted, we simply write */
+		wanted = RIVSIZE+5;
+		wrote = fwrite(((int*)&output->cached), sizeof(int), wanted, lexWord);
+	}
+
+	/* fclose flushes, so its result matters as much as fwrite's */
+	int failed = (wrote != wanted);
+	if(fclose(lexWord) != 0){
+		failed = 1;
+	}
+	if(failed){
+		fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
+		return 1;
+	}
+
+	/* and free the memory */
+	free(output);
+
+	return 0;
+}
+
+/* fLexPull: read one vector from an open lexicon file.
+ * the file starts with a size_t: 0 marks a dense vector, any other value is
+ * the number of non-zeros of a sparse vector. then follow frequency,
+ * contextSize and magnitude (one int-sized slot each), then either RIVSIZE
+ * dense values, or `count` indices followed by `count` values.
+ * returns a newly allocated denseRIV, or NULL on allocation failure or on a
+ * short or implausible file. the caller closes lexWord */
+denseRIV* fLexPull(FILE* lexWord){
+	denseRIV *output = calloc(1,sizeof(denseRIV));
+	if(!output){
+		return NULL;
+	}
+	size_t typeCheck = 0;
+	/* the first 8 byte value in the file will be either 0 (indicating storage as a dense vector)
+	 * or a positive number, the number of values in a sparse-vector */
+	if(fread(&typeCheck, sizeof typeCheck, 1, lexWord) != 1){
+		free(output);
+		return NULL;
+	}
+
+	if (typeCheck){ /* pull as sparseVector */
+
+		/* a vector cannot hold more non-zeros than it has dimensions; a
+		 * larger count means a corrupt file and would overrun the temp block */
+		if(typeCheck > RIVSIZE){
+			printf("vector read failure");
+			free(output);
+			return NULL;
+		}
+
+		/*create a sparseVector pointer, pointing to a preallocated slot */
+		sparseRIV* temp = (sparseRIV*)RIVKey.h_tempBlock;
+		/* typecheck, non-zero, is the number of values in our vector */
+		temp->count = typeCheck;
+		/* locations slot comes immediately after the magnitude */
+		temp->locations = (int*)&(temp->magnitude) + 1;
+		/* and values slot comes immediately after locations */
+		temp->values = temp->locations+temp->count;
+
+		size_t expected = (typeCheck* 2)+3;
+		if (fread(&(temp->frequency), sizeof(int), expected, lexWord) != expected){
+			printf("vector read failure");
+			free(output);
+			return NULL;
+		}
+
+		/* carry the three metadata slots (frequency, contextSize, magnitude)
+		 * over to the output, as the dense branch does by reading them in
+		 * place; both structs are already treated as holding them contiguously */
+		memcpy(&output->frequency, &temp->frequency, 3*sizeof(int));
+
+		/* add our temporary sparseVector to the empty denseVector, for output */
+		addS2D(output->values, *temp);
+	}else{ /* typecheck is thrown away, just a flag in this case */
+
+		/*  read into our denseVector pre-formatted to fit */
+		if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
+			printf("vector read failure");
+			free(output);
+			return NULL;
+		}
+	}
+
+	return output;
+}
+/* signalSecure: signal handler that saves cached data before the process dies.
+ * walks the global cache list from rootCache, dumping every cache to file
+ * (rootCache is left NULL afterwards), then restores the default action for
+ * the signal and re-raises it so the process ends the way it normally would.
+ * si and arg are unused */
+void signalSecure(int signum, siginfo_t *si, void* arg){
+	for(; rootCache; rootCache = rootCache->next){
+		/* dumping all caches contained */
+		if(cacheDump(rootCache->cache)){
+			fprintf(stderr, "cache dump failed, some lexicon data lost");
+		}
+	}
+	/* end with normal behavior of error */
+	signal(signum, SIG_DFL);
+	kill(getpid(), signum);
+}
+/* cacheDump: write every vector held in a cache array to its lexicon file,
+ * then free the array itself.
+ * toDump: array of CACHESIZE vector pointers.
+ * each vector is pushed to the lexicon recorded in its `cached` field.
+ * returns the sum of the fLexPush results (0 when every push succeeded) */
+int cacheDump(denseRIV* *toDump){
+	/* counts failed pushes */
+	int failures = 0;
+
+	for(int slot = 0; slot < CACHESIZE; slot++){
+		denseRIV* entry = toDump[slot];
+		#ifdef HASHCACHE
+		/* if our cache is hashed, there may be null vectors to be skipped */
+		if(entry){
+			failures += fLexPush((LEXICON*)entry->cached, entry);
+		}
+		#else /* HASHCACHE */
+		#ifdef SORTCACHE
+		/* if our cache is sorted, a null vector represents the end of the cache */
+		if(!entry)break;
+
+		failures += fLexPush((LEXICON*)entry->cached, entry);
+		#endif /* SORTCACHE */
+		#endif
+	}
+
+	free(toDump);
+
+	return failures;
+}
+#endif /* RIV_LEXICON_H */
--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,10 +6,11 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
-//#define HASHCACHE
+
 #define RIVSIZE 50000
 #define NONZEROS 4
-#define CACHESIZE 27000
+#define CACHESIZE 25000
+#define SORTCACHE
 #include "RIVtools.h"

 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
 void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
 LEXICON* lp;
-//int COUNTY = 0;
+int COUNTY = 0;
 int main(int argc, char *argv[]){

 	char pathString[1000];
-	lp = lexOpen("lexicon", "rw");
+	lp = lexOpen("lexiconshitty", "r");
 	//we open the lexicon, if it does not yet exist, it will be created
 	
 	
@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
 	//ensure that the targeted root directory exists
-	
 	struct stat st;
 	if(stat(pathString, &st) == -1) {
 		printf("directory doesn't seem to exist");
@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
 		//open a file within root directory
 		FILE *input = fopen(pathString, "r");
 		if(input){
+			if(COUNTY++>1000) return;
 			//process this file and add it's data to lexicon
 			//fprintf(stderr, "***%d", COUNTY++);
+
 			fileGrind(input);
 			
 			fclose(input);
@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
 		//we pull the vector corresponding to each word from the lexicon
 		//if it's a new word, lexPull returns a 0 vector
 		lexiconRIV= lexPull(lp, word);
-
+		if(!lexiconRIV){
+			printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
+			continue;
+		}
 		//we add the context of this file to this wordVector
 		addContext(lexiconRIV, contextVector);
 		
@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
 	}
 	//free the heap allocated context vector data
 	free(contextVector.locations);
-	
-	
-	
-	
-	
-	
-	
-	
 }

 void addContext(denseRIV* lexRIV, sparseRIV context){
 		
 		//add context to the lexRIV, (using sparse-dense vector comparison)
-		addS2D(lexRIV->values, context);
+		sparseRIV thing = context;
+		addS2D(lexRIV->values, thing);
 		
 		//log the "size" of the vector which was added
 		//this is not directly necessary, but is useful metadata for some analises

--- a/runscriptUb.sh
+++ b/runscriptUb.sh
-clean(){
-	while [ "$1" ]; do
-		
-		./RIVread "$1"
-		
-		shift
-	done
-}
-
-clean ../bookCleaner/cleanbooks/*
--- a/saturation.c
+++ b/saturation.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <dirent.h>
-#include <time.h>
-#include "RIVtoolsCPUlinux.h"
-void directoryToL2s(char *rootString);
-int main(){
-	RIVInit();
-	char rootString[] = "lexicon/";
-	directoryToL2s(rootString);
-	
-	
-	
-	
-	
-}
-void directoryToL2s(char *rootString){
-	sparseRIV fileRIV;
-	char pathString[2000];
-	DIR *directory;
-    struct dirent *files = 0;
-	
-	if(!(directory = opendir(rootString))){
-		printf("location not found, %s\n", rootString);
-		return;
-	}
-	
-	while((files=readdir(directory))){
-		if(*(files->d_name) == '.') continue;
-	
-		if(files->d_type == DT_DIR){
-			strcpy(pathString, rootString);
-			
-			strcat(pathString, files->d_name);
-			strcat(pathString, "/");
-			directoryToL2s(pathString);
-		}
-			
-
-		strcpy(pathString, rootString);
-		strcat(pathString, files->d_name);
-		FILE *input = fopen(pathString, "r");
-		if(!input){
-			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
-			return;
-		}else{
-			denseRIV temp = lexPull(pathString);
-			fileRIV = consolidateD2S(temp.values);
-			strcpy(fileRIV.name, pathString);
-			float count = fileRIV.count;
-			printf("%s, saturation: %f\n", fileRIV.name, count);
-			fclose(input);
-			free(temp.values);
-			//free(fileRIV.locations);
-		}
-	}
-}
--- a/someshit.c
+++ b/someshit.c
+#include <stdio.h>
+#include "RIVaccessories.h"
+#include <time.h>
+/* interactive timing harness: builds the stem tree, then for every word read
+ * from stdin prints what treeSearch returns for it (or "no entry") and how
+ * long the lookup took. ends at end of input; exits with 1 if the tree could
+ * not be built */
+int main(){
+	struct treenode* root = stemTreeSetup();
+	if(!root){
+		/* stemTreeSetup returns 0 when its word file is missing */
+		return 1;
+	}
+	char word[100];
+	char* stem;
+	clock_t start, end;
+	puts("tree ready");
+	/* the width keeps a long token from overrunning word[]; checking the
+	 * result stops the loop at EOF instead of spinning on stale input */
+	while(scanf("%99s", word) == 1){
+		start = clock();
+		stem = treeSearch(root, word) ;
+		end = clock();
+
+		if(stem){
+			puts(stem);
+		}else{
+			puts("no entry");
+		}
+		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
+	}
+	return 0;
+}
--- a/stemconfig/dbtools.py
+++ b/stemconfig/dbtools.py
+import pymongo
+from pymongo import MongoClient
+
+
+def dbSetup():
+    client = MongoClient("mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet")
+    database = client.rivwordnet
+    collection = database.stems
+
+    collection.create_index("from")
+    return collection
+
+
+def dbPost(wordset, collection):
+    if not len(wordset):
+        return
+
+    posts = []
+    for key, value in wordset.iteritems():
+        post = {"from": key, "to": value}
+        posts.append(post)
+
+    collection.insert_many(posts)
+
+def cleanDbSetup():
+    client = MongoClient("mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs")
+    database = client.rivetcleandocs
+    collection = database.cleaned
+    collection.create_index("file")
+    return collection
+
+def dbPostCleaned(text, file, collection):
+    if not len(text):
+        return
+    document = {
+        "text": text,
+        "file": file,
+    }
+    collection.insert_one(document)
+
+
+
+def dbGet(words, collection):
+
+
+
+    if mebewords:
+        return mebeword["to"]
+    else:
+        return 0
\ No newline at end of file
--- a/stemconfig/dbtools.pyc
+++ b/stemconfig/dbtools.pyc
--- a/stemconfig/stemconf
+++ b/stemconfig/stemconf
--- a/stemconfig/stemconf.c
+++ b/stemconfig/stemconf.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+int configInsert(struct treenode* node, char* letter, int treeSize);
+int stemTreeConfig();
+/* prints the value returned by stemTreeConfig to stdout, with no trailing
+ * newline */
+int main(){
+	printf("%d", stemTreeConfig());
+	return 0;
+}
+
+/* configInsert: thread one word into the letter tree, creating the nodes
+ * that are missing.
+ * node:     node to start from
+ * letter:   remaining characters of the word; only 'a'..'z' have a link slot
+ * treeSize: node count so far
+ * every node visited has its downstream counter incremented.
+ * returns treeSize plus the number of nodes created. insertion stops early
+ * (returning the count so far) at the first character outside 'a'..'z' or
+ * when a node cannot be allocated */
+int configInsert(struct treenode* node, char* letter, int treeSize){
+
+	for(;; letter++){
+		node->downstream++;
+		if(!*letter){
+			return treeSize;
+		}
+		/* links[] has 26 slots; anything else would index out of bounds */
+		if(*letter < 'a' || *letter > 'z'){
+			return treeSize;
+		}
+		int slot = *letter - 'a';
+		if(!node->links[slot]){
+			struct treenode* fresh = calloc(1, sizeof(struct treenode));
+			if(!fresh){
+				return treeSize;
+			}
+			node->links[slot] = fresh;
+			treeSize++;
+		}
+		node = node->links[slot];
+	}
+}
+
+/* stemTreeConfig: count the nodes a letter tree needs for every word in
+ * "wordset.txt".
+ * words are read one whitespace-separated token at a time and inserted with
+ * configInsert. for each word, one entry of the global stemset string is
+ * NUL-terminated in place, as long as entries remain.
+ * returns the number of tree nodes (the root counts as 1), or 0 when the
+ * word file cannot be opened or the root cannot be allocated.
+ * the tree is not freed */
+int stemTreeConfig(){
+	int treeSize = 1;
+	FILE* wordFile = fopen("wordset.txt", "r");
+	if(!wordFile){
+		printf("no wordnet file");
+		return 0;
+	}
+
+	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	if(!rootNode){
+		fclose(wordFile);
+		return 0;
+	}
+	char word[100];
+	char* stem = (char*)stemset;
+	/* fscanf returns EOF (non-zero) at end of input, so compare against 1;
+	 * the width keeps a long token from overrunning word[] */
+	while(fscanf(wordFile, "%99s", word) == 1){
+		if(*stem){
+			/* %n is skipped when no token is found, so start from 0 */
+			int displacement = 0;
+			sscanf(stem, "%*s%n", &displacement);
+			if(stem[displacement]){
+				/* cut the entry at its separator and step past it */
+				stem[displacement] = '\0';
+				stem += displacement+1;
+			}else{
+				/* last entry with no separator: stay on the terminator */
+				stem += displacement;
+			}
+		}
+
+		treeSize = configInsert(rootNode, word, treeSize);
+	}
+	fclose(wordFile);
+	return treeSize;
+}
--- a/stemconfig/stemconf.o
+++ b/stemconfig/stemconf.o
--- a/stemconfig/stemconfig
+++ b/stemconfig/stemconfig
--- a/stemconfig/stemconfig.c
+++ b/stemconfig/stemconfig.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+/* prints the value returned by stemTreeConfig to stdout, with no trailing
+ * newline */
+int main(){
+	printf("%d", stemTreeConfig());
+	return 0;
+}
--- a/stemconfig/stemconfig.py
+++ b/stemconfig/stemconfig.py
+import dbtools
+from subprocess import call
+
+collection = dbtools.dbSetup()
+
+preset = collection.find()
+set = {}
+for doc in preset:
+	set[doc["from"]] = doc["to"]
+words = [];
+stems = [];
+for key, value in set.iteritems():
+	words.append(key);
+	stems.append(value);
+	
+wordFILE = open("wordset.txt", "w")
+wordFILE.write(' '.join(words));
+wordFILE.close()
+stemFILE = open("stemset.h", "w")
+finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";'+'\nint treesize = '
+
+stemFILE.write(finalOut + '0;')
+
+stemFILE.close()
+
+tempfile = open("tempfile.txt", "w")
+call(["gcc", "stemconf.c","-o", "stemconfig"])
+call(["./stemconfig"], stdout=tempfile)
+tempfile.close()
+tempfile = open("tempfile.txt", "r")
+treesize = tempfile.read();
+finalOut = finalOut + treesize + ';'
+stemFile = open("stemset.h", "w")
+stemFile.write(finalOut)
+stemFile.close;
+
--- a/stemconfig/stemset.h
+++ b/stemconfig/stemset.h
--- a/stemconfig/tempfile.txt
+++ b/stemconfig/tempfile.txt
+279920
\ No newline at end of file
--- a/stemconfig/wordset.txt
+++ b/stemconfig/wordset.txt
--- a/stemnet2.txt
+++ b/stemnet2.txt
--- a/treetest.c
+++ b/treetest.c
+#include <stdio.h>
+#include "RIVtools.h"
+
+
+
+/* interactive tree test. alternates between two modes, reading one word per
+ * step from stdin:
+ *   search mode: print what treeSearch returns for the word (or "NULL return");
+ *                a word starting with '1' (itself still searched) switches
+ *                to cut mode
+ *   cut mode:    treecut the word; a word starting with '0' (itself still
+ *                cut) switches back to search mode
+ * ends at end of input; exits with 1 if the tree could not be built */
+int main(){
+
+	struct treenode* root = stemTreeSetup();
+	if(!root){
+		/* stemTreeSetup returns 0 when its word file is missing */
+		return 1;
+	}
+	/* zeroed so the first mode test does not read an uninitialised byte */
+	char word[100] = {0};
+	char* stem;
+	while(1){
+		while(*word != '1'){
+
+			/* bounded read; stop at EOF instead of looping on stale input */
+			if(scanf("%99s", word) != 1){
+				return 0;
+			}
+			stem = treeSearch(root, word);
+			if(stem){
+				puts(stem);
+			}else{
+				puts("NULL return");
+			}
+
+		}
+		while(*word != '0'){
+			if(scanf("%99s", word) != 1){
+				return 0;
+			}
+			treecut(root, word);
+
+		}
+
+	}
+
+}