secured push against cache

ad4b27c9 · etcart · 9fd65b3a · ad4b27c9 · ad4b27c9 · 9fd65b3a
Commit ad4b27c9 authored Apr 25, 2018 by etcart
Showing with 515 additions and 108 deletions
RIVLower.h
RIVLower.h.gch
RIVLower.h.save
RIVclasses
RIVclasses.c
RIVclasses.o
RIVconsolidate.c
RIVcull
RIVcull.c
RIVgraphout
RIVgraphout.c
RIVgraphout.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread
RIVread.c
RIVread.o
RIVtools.h.gch
graphdata.txt
output.txt
--- a/RIVLower.h
+++ b/RIVLower.h
@@ -7,6 +7,7 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include "RIVaccessories.h"
+#include "assert.h"
 /* RIVSIZE macro defines the dimensionality of the RIVs we will use
 * 25000 is the standard, but can be redefined specifically
 */
@@ -14,8 +15,8 @@
 #define RIVSIZE 25000
 #endif
-#if RIVSIZE<0
+#if RIVSIZE<4
-#error "RIVSIZE must be a positive number (preferably a large positive)"
+#error "RIVSIZE must be a positive number, greater than 4 (preferably a large positive)"
 #endif
 /* NONZeros macro defines the number of non-zero values that will be generated
@@ -36,7 +37,7 @@
 * that do not use lexpull/push
 */
 #ifndef CACHESIZE
-#define CACHESIZE 5000
+#define CACHESIZE 10000
 #endif
 #if CACHESIZE<0
@@ -57,10 +58,10 @@ typedef struct{
 	char name[100];
 	int *values;
 	int *locations;
-	size_t count;
+	int count;
-	float magnitude;
-	int contextSize;
 	int frequency;
+	int contextSize;
+	float magnitude;
 }sparseRIV;
 /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
 * this is rarely the case, but its primary use is for performing vector
@@ -68,11 +69,11 @@ typedef struct{
 * performed between sparse and dense (hetero-arithmetic)
 */
 typedef struct{
-	int cached;
 	char name[100];
+	int cached;
 	int frequency;
-	float magnitude;
 	int contextSize;
+	float magnitude;
 	int values[RIVSIZE];
 }denseRIV;
@@ -99,13 +100,13 @@ sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 */
-void makeSparseLocations(char* word,  int *seeds, size_t seedCount);
+void makeSparseLocations(char* word,  int *seeds, int seedCount);
 /* mapI2D maps an "implicit RIV" that is, an array of index values, 
 * arranged by chronological order of generation (as per makesparseLocations)
 * it assigns, in the process of mapping, values according to ordering
 */
-int* mapI2D(int *locations, size_t seedCount);
+int* mapI2D(int *locations, int seedCount);
 /* highly optimized method for adding vectors.  there is no method 
 * included for adding D2D or S2S, as this system is faster-enough
@@ -121,7 +122,7 @@ int cacheDump();
 /* adds all elements of an implicit RIV (a sparseRIV represented without values)
 * to a denseRIV.  used by the file2L2 functions in aggregating a document vector
 */
-int* addI2D(int* destination, int* locations, size_t seedCount);
+int* addI2D(int* destination, int* locations, int seedCount);
 /*subtracts a word's vector from its own context.  regularly used in lex building
 */
@@ -136,6 +137,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
 	/* apply values at an index based on locations */
 	while(locations_slider<locations_stop){
 		destination[*locations_slider] += *values_slider;
 		locations_slider++;
 		values_slider++;
@@ -144,7 +146,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
 	return destination;
 }
-int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+int* mapI2D(int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
 	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
 	int *locations_slider = locations;
 	int *locations_stop = locations_slider+valueCount;
@@ -160,7 +162,7 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
 	return destination;
 }
-int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+int* addI2D(int* destination, int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
 	int *locations_slider = locations;
 	int *locations_stop = locations_slider+valueCount;
@@ -203,6 +205,7 @@ sparseRIV consolidateD2S(int *denseInput){
 		}
 	}
 	/* a slot is opened for the locations/values pair */
 	output.locations = (int*) malloc(output.count*2*sizeof(int));
 	if(!output.locations){
 		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
@@ -220,7 +223,7 @@ sparseRIV consolidateD2S(int *denseInput){
-void makeSparseLocations(char* word,  int *locations, size_t count){
+void makeSparseLocations(char* word,  int *locations, int count){
 	locations+=count;
 	srand(wordtoSeed(word));
 	int *locations_stop = locations+NONZEROS;

--- a/RIVLower.h.gch
+++ b/RIVLower.h.gch
--- a/RIVLower.h.save
+++ b/RIVLower.h.save
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
+#include <stdio.h>
+#define CACHESIZE 0
+#define CACHEEXCLUSIVE 1
+#define RIVSIZE 50000
+#include "RIVtools.h"
+char* stem(char* word);
+/* Reads ../books/pg56902.txt one whitespace-delimited word at a time, runs
+ * each word through stem(), pulls the stemmed word from the lexicon
+ * "consolidatedLexicon50-8" and adds its vector into one accumulated dense
+ * vector.  Words rejected by stem() or missing from the lexicon are reported
+ * on stdout.
+ * Returns 0 on success, 1 if the text file cannot be opened.
+ */
+int main(){
+	lexOpen("consolidatedLexicon50-8");
+	FILE* text = fopen("../books/pg56902.txt", "r");
+	if(!text){
+		puts("no file");
+		lexClose();
+		return 1;
+	}
+	denseRIV accumulate = {0};
+	sparseRIV temp;
+	char word[100];
+	/* fscanf returns EOF (a non-zero value) at end of input, so the result
+	 * is compared against the one conversion requested; this also keeps a
+	 * final word that ends exactly at end-of-file */
+	while(fscanf(text, "%99s", word) == 1){
+		if(!stem(word)){
+			printf("%s, not in wordNet\n", word);
+			continue;
+		}
+		denseRIV* wordRIV = lexPull(word);
+		if(!wordRIV){
+			printf("%s, not in lexicon\n", word);
+			continue;
+		}
+		/* the dense vector is reduced to sparse form before being added */
+		temp = consolidateD2S(wordRIV->values);
+		addS2D(accumulate.values, temp);
+		free(temp.locations);
+		free(wordRIV);
+	}
+	fclose(text);
+	lexClose();
+	return 0;
+}
+/* Looks word up under WN/ and rewrites it in place to its stem.
+ * The file WN/<word> begins with an integer code:
+ *   1 - word is returned unchanged
+ *   2 - the next token in the file names another entry; word is replaced by
+ *       that name, WN/<name> is opened, and the token following its leading
+ *       integer (if there is one) replaces word again
+ *   any other value, or no readable integer - no stem
+ * word must point to a buffer of at least 100 bytes; reads into it are
+ * capped at 99 characters.
+ * Returns word on success, NULL when a file cannot be opened or the code
+ * gives no stem.
+ */
+char* stem(char* word){
+	char pathString[200];
+	int WNdata = 0;
+	char* result = NULL;
+	snprintf(pathString, sizeof pathString, "WN/%s", word);
+	FILE* WNfile = fopen(pathString, "r");
+	if(!WNfile) return NULL;
+	/* a failed conversion is treated like code 0: no stem */
+	if(fscanf(WNfile, "%d", &WNdata) != 1) WNdata = 0;
+	if(WNdata == 1){
+		result = word;
+	}else if(WNdata == 2){
+		/* when a token is missing, fscanf leaves word as it was */
+		fscanf(WNfile, "%99s", word);
+		fclose(WNfile);
+		snprintf(pathString, sizeof pathString, "WN/%s", word);
+		WNfile = fopen(pathString, "r");
+		if(!WNfile) return NULL;
+		fscanf(WNfile, "%*d%99s", word);
+		result = word;
+	}
+	/* every path that reaches this point still holds an open file */
+	fclose(WNfile);
+	return result;
+}
--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVconsolidate.c
+++ b/RIVconsolidate.c
+#include <stdio.h>
+#define RIVSIZE 50000
+#define CACHESIZE 0
+#include "RIVtools.h"
+#include <dirent.h>
+/* Copies the larger vectors of the lexicon directory named by argv[1] into
+ * the lexicon "consolidatedLexicon50-8".
+ * Entries whose name starts with '.', sub-directories, entries lexPull cannot
+ * supply and entries with a contextSize below 7000 are left out.  Each kept
+ * entry is passed through normalize(*intake, 10000), expanded back into a
+ * dense vector and held in memory until the source lexicon is closed, then
+ * pushed to the destination lexicon.
+ * One line is printed per kept entry:
+ *   frequency,contextSize,magnitude,kept-so-far,seen-so-far
+ * Returns 0 on success, 1 when no directory is given or it cannot be opened.
+ */
+int main(int argc, char* argv[]){
+	if(argc < 2){
+		printf("give me a directory\n");
+		return 1;
+	}
+	lexOpen(argv[1]);
+	denseRIV* intake;
+	sparseRIV examine;
+	static denseRIV *output[60000] = {0};
+	const int outputMax = (int)(sizeof output / sizeof output[0]);
+	DIR *directory;
+	struct dirent *files = 0;
+	if(!(directory = opendir(argv[1]))){
+		printf("location not found, %s\n", argv[1]);
+		return 1;
+	}
+	int i=0;
+	int j=0;
+	while((files=readdir(directory))){
+		if(*(files->d_name) == '.') continue;
+		if(files->d_type == DT_DIR){
+			/* the lexicon should not have valid sub-directories */
+			continue;
+		}
+		/* d_name may be longer than the name field it is copied into below */
+		if(strlen(files->d_name) >= sizeof examine.name){
+			printf("skipped: %s\n", files->d_name);
+			continue;
+		}
+		j++;
+		intake = lexPull(files->d_name);
+		if(!intake) continue;
+		/* vectors with a contextSize under 7000 are discarded */
+		if(intake->contextSize<7000){
+			free(intake);
+			continue;
+		}
+		if(i >= outputMax){
+			printf("output table full at %d entries, stopping\n", outputMax);
+			free(intake);
+			break;
+		}
+		examine = normalize(*intake, 10000);
+		strcpy(examine.name, files->d_name);
+		printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
+		output[i] = calloc(1, sizeof(denseRIV));
+		if(!output[i]){
+			printf("memory allocation failed\n");
+			free(intake);
+			free(examine.locations);
+			break;
+		}
+		addS2D(output[i]->values, examine);
+		output[i]->magnitude = examine.magnitude;
+		strcpy(output[i]->name, files->d_name);
+		output[i]->frequency = intake->frequency;
+		output[i]->contextSize = intake->contextSize;
+		free(intake);
+		free(examine.locations);
+		i++;
+	}
+	closedir(directory);
+	lexClose();
+	/* the source lexicon is closed before the destination is opened */
+	lexOpen("consolidatedLexicon50-8");
+	for(int k=0; k<i; k++){
+		lexPush(output[k]);
+	}
+	lexClose();
+	return 0;
+}
--- a/RIVcull
+++ b/RIVcull
--- a/RIVcull.c
+++ b/RIVcull.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <time.h>
+#include "RIVtools.h"
+#define THRESHOLD 0.70
+/* this program identifies all near-duplicates among the documents in the 
+ * chosen root directory, using RIV comparison */
+// fills the fileRIVs array with a vector for each file in the root directory
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
+/* Reports near-duplicate files found under the directory given as argv[1].
+ * directoryToL2s fills fileRIVs with one sparse vector per file.  Each vector
+ * is then expanded into a dense buffer and compared, via cosCompare, with
+ * every earlier vector whose magnitude lies strictly between 85% and 115% of
+ * its own; pairs whose cosine exceeds THRESHOLD are printed.
+ * Returns 0 on success, 1 on a usage, path-length or allocation error.
+ */
+int main(int argc, char *argv[]){
+	int fileCount = 0;
+	char rootString[2000];
+	if(argc <2){ 
+		printf("give me a directory");
+		return 1;
+	}
+	/* build "<argv[1]>/" without writing past the end of rootString */
+	int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
+	if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
+		printf("directory name too long\n");
+		return 1;
+	}
+	//initializes the fileRIVs array to be realloced by the later function
+	sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
+	if(!fileRIVs){
+		printf("memory allocation failed");
+		return 1;
+	}
+	//gather all vectors into the fileRIVs array and count them in fileCount
+	directoryToL2s(rootString, &fileRIVs, &fileCount);
+	printf("fileCount: %d\n", fileCount);
+	//first calculate all magnitudes for later use
+	for(int i = 0; i < fileCount; i++){
+		fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
+	}
+	clock_t begintotal = clock();
+	//all cosines need a sparse-dense comparison, so each base vector is
+	//expanded into this dense buffer in turn
+	denseRIV baseDense;
+	for(int i = 0; i < fileCount; i++){
+		//0 out the denseVector, and map the next sparseVector to it
+		memset(&baseDense, 0, sizeof(denseRIV));
+		addS2D(baseDense.values, fileRIVs[i]);
+		//pass magnitude to the dense vector
+		baseDense.magnitude = fileRIVs[i].magnitude;
+		//if these two vectors are too different in size, we can know that they are not duplicates
+		double minmag = baseDense.magnitude*.85;
+		double maxmag = baseDense.magnitude*1.15;
+		for(int j = 0; j < i; j++){
+			//if this vector is within magnitude threshold
+			if(fileRIVs[j].magnitude < maxmag 
+			&& fileRIVs[j].magnitude > minmag){
+				//identify the similarity of these two vectors
+				double cosine = cosCompare(baseDense, fileRIVs[j]);
+				//if the two are similar enough to be flagged
+				if(cosine>THRESHOLD){
+					printf("%s\t%s\n%f\n", fileRIVs[i].name , fileRIVs[j].name, cosine);
+				}	
+			}
+		}
+	}
+	printf("fileCount: %d", fileCount);
+	// NOTE(review): each fileRIVs[i].locations is presumably heap-allocated
+	// by fileToL2 and is not released here - confirm before adding frees
+	free(fileRIVs);
+	clock_t endtotal = clock();
+	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
+	printf("total time:%lf\n\n", time_spent);
+return 0;
+}
+//mostly a standard recursive Dirent-walk
+/* Recursively walks the directory rootString and appends one sparseRIV,
+ * produced by fileToL2, for every file that can be opened.
+ *   rootString - directory path; entry names are appended to it directly,
+ *                so it is expected to end in '/'
+ *   fileRIVs   - address of the growing array; it is resized with realloc,
+ *                so the caller's pointer may change
+ *   fileCount  - incremented once per vector added
+ * Each vector's name is set to the file's path, truncated to fit the name
+ * field.  Entries whose name begins with '.' are skipped, as are paths too
+ * long for the local buffer.
+ */
+void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
+/* *** begin Dirent walk *** */
+	char pathString[2000];
+	DIR *directory;
+	struct dirent *files = 0;
+	if(!(directory = opendir(rootString))){
+		printf("location not found, %s\n", rootString);
+		return;
+	}
+	while((files=readdir(directory))){
+		if(!files->d_name[0]) break;
+		/* skip ".", ".." and other dot entries; returning to the loop
+		 * condition lets a NULL from readdir end the walk instead of
+		 * being dereferenced */
+		if(*(files->d_name)=='.') continue;
+		/* leave room for a trailing '/' in case this is a directory */
+		int pathLength = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
+		if(pathLength < 0 || (size_t)pathLength >= sizeof pathString - 1){
+			printf("skipped: %s\n", files->d_name);
+			continue;
+		}
+		if(files->d_type == DT_DIR){
+			strcat(pathString, "/");
+			directoryToL2s(pathString, fileRIVs, fileCount);
+			continue;
+		}
+/* *** end dirent walk, begin meat of function  *** */
+		FILE *input = fopen(pathString, "r");
+		if(!input) continue;
+		/* keep the old array reachable if realloc fails */
+		sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
+		if(!grown){
+			printf("memory allocation failed");
+			fclose(input);
+			break;
+		}
+		*fileRIVs = grown;
+		grown[*fileCount] = fileToL2(input);
+		snprintf(grown[*fileCount].name, sizeof grown[*fileCount].name, "%s", pathString);
+		fclose(input);
+		*fileCount += 1;
+	}
+	closedir(directory);
+}
--- a/RIVgraphout
+++ b/RIVgraphout
--- a/RIVgraphout.c
+++ b/RIVgraphout.c
 #include <stdio.h>
-#define RIVSIZE 25000
+#define RIVSIZE 50000
 #define CACHESIZE 0
 #include "RIVtools.h"
 #include <dirent.h>
@@ -7,8 +7,6 @@
 int main(int argc, char* argv[]){
 	lexOpen(argv[1]);
 	denseRIV* intake;
-	sparseRIV examine;
-	static denseRIV *output[60000] = {0};
 	DIR *directory;
    struct dirent *files = 0;
@@ -28,27 +26,15 @@ int main(int argc, char* argv[]){
 		intake = lexPull(files->d_name);
 		/* if the vector has been encountered more than MINSIZE times
 		 * then it should be statistically significant, and useful */
-		if(intake->contextSize<10000)continue;
-		examine = normalize(*intake, 500);
-		strcpy(examine.name, files->d_name);
-		printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
-		output[i] = calloc(1, sizeof(denseRIV));
-		addS2D(output[i]->values, examine);
-		output[i]->magnitude = examine.magnitude;
-		strcpy(output[i]->name, files->d_name);
-		output[i]->frequency = intake->frequency;
-		free(intake);
-		free(examine.locations);
-		i++;
-	}
-	lexClose();
-	/*lexOpen("consolidatedLexiconAggressive");
-	for(int j=0; j<i; j++){
-		lexPush(output[j]);
+		printf("%d,%d,%lf,%d,%s\n", intake->frequency, intake->contextSize, intake->magnitude, i, files->d_name);
+		free(intake);
+		i++;
 	}
-	lexClose();*/
+	lexClose();
 	return 0;
 }
--- a/RIVgraphout.o
+++ b/RIVgraphout.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread
+++ b/RIVread
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,6 +6,7 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
+#define CACHESIZE 100000
 #include "RIVtools.h"
 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -17,6 +18,7 @@ void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
 int main(int argc, char *argv[]){
 	char pathString[1000];
 	//we open the lexicon, if it does not yet exist, it will be created
@@ -69,7 +71,7 @@ void directoryGrind(char *rootString){
 			printf("skipped: %s\n", files->d_name); 
 			continue;
 		}
+		puts(files->d_name);
 		//open a file within root directory
 		FILE *input = fopen(pathString, "r");
 		if(input){
@@ -83,11 +85,11 @@ void directoryGrind(char *rootString){
 void fileGrind(FILE* textFile){
-	char textLine[5000];
+	char textLine[10000];
 	// included python script separates paragraphs into lines
+	int i=0;
-	while(fgets(textLine, 4999, textFile)){
+	while(fgets(textLine, 9999, textFile)){
+		printf("line: %d\n", i++);
 		if(!strlen(textLine)) continue;
 		if(feof(textFile)) break;
@@ -100,7 +102,11 @@ void fileGrind(FILE* textFile){
 void lineGrind(char* textLine){
 	//extract a context vector from this text set
 	sparseRIV contextVector = textToL2(textLine);
+	if(contextVector.contextSize <= 1){
+		free(contextVector.locations);
+		return;
+	}
 	denseRIV* lexiconRIV;
 	//identify stopping point in line read
 	char* textEnd = textLine + strlen(textLine)-1;
@@ -110,6 +116,7 @@ void lineGrind(char* textLine){
 		sscanf(textLine, "%99s%n", word, &displacement);
 		//we ensure that each word exists, and is free of unwanted characters
+		textLine += displacement+1;
 		if(!(*word))continue;
 		if(!isWordClean((char*)word)){
@@ -132,7 +139,7 @@ void lineGrind(char* textLine){
 		//and finally we push it back to the lexicon for permanent storage
 		lexPush(lexiconRIV);
-		textLine += displacement+1;
 	}
 	//free the heap allocated context vector data

--- a/RIVread.o
+++ b/RIVread.o
--- a/RIVtools.h.gch
+++ b/RIVtools.h.gch
--- a/graphdata.txt
+++ b/graphdata.txt
--- a/output.txt
+++ b/output.txt
--- a/test.py
+++ b/test.py
 import numpy as np
 import matplotlib.pyplot as plt
+import math
+def fit(x):
+    """Evaluate the reference curve 1067 + 94500000/x at x."""
+    offset = 1067
+    scale = 94500000
+    return offset + scale/x
+x = 7
+range = 0.15
+while(1):
+    range = input("gimmerange");
+    data = open("graphdata.txt", "r");
+    frequencies = [];
+    mags = [];
+    fitline = [];
+    i = 0;
+    # Each line is comma-separated: field 1 -> freq, field 2 -> mag,
+    # field 4 -> name.  Rows with freq above 40000, or whose mag falls
+    # outside fit(freq) scaled by (1 +/- range), are left out of the plot.
+    for line in data:
+        segments = line.split(",")
+        freq = int(segments[1])
+        mag = float(segments[2])
+        name = segments[4]
+        if freq > 40000:
+            continue
+        core = fit(freq)
+        fitmax = core*(1+range)
+        fitmin = core*(1-range)
+        if mag > fitmax or mag < fitmin:
+            continue
+        frequencies.append(freq)
+        mags.append(mag)
+        # these two lines were tab-indented inside a space-indented body,
+        # which Python 3 rejects with TabError
+        fitline.append(core)
+        print("{} {} {}".format(name, freq, mag))
+        i+=1
-data = open("../code/RIVet/graphdata.txt", "r");
-frequencies = [];
+    #plt.scatter(frequencies, mags)
-mags = [];
+    plt.plot(frequencies, fitline, 'r^', frequencies, mags, 'bs')
-i = 0;
+    plt.show()
-for line in data:
+    x+=1
-    if(int(line.split(",")[1])>40000):
-         continue;
-    frequencies.append(int(line.split(",")[1]))
-    mags.append(float(line.split(",")[2]))
-    if(mags[i]>80 and frequencies[i]>7000 and frequencies[i]<15000):
-        print(line)
-    i+=1
-plt.scatter(frequencies, mags)
-plt.show()