Commit ad4b27c9 by etcart

secured push against cache

parent 9fd65b3a
......@@ -7,6 +7,7 @@
#include <unistd.h>
#include <sys/stat.h>
#include "RIVaccessories.h"
#include "assert.h"
/* RIVSIZE macro defines the dimensionality off the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
......@@ -14,8 +15,8 @@
#define RIVSIZE 25000
#endif
#if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#if RIVSIZE<4
#error "RIVSIZE must be a positive number, greater than 4 (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
......@@ -36,7 +37,7 @@
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 5000
#define CACHESIZE 10000
#endif
#if CACHESIZE<0
......@@ -57,10 +58,10 @@ typedef struct{
char name[100];
int *values;
int *locations;
size_t count;
float magnitude;
int contextSize;
int count;
int frequency;
int contextSize;
float magnitude;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
......@@ -68,11 +69,11 @@ typedef struct{
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
int cached;
char name[100];
int cached;
int frequency;
float magnitude;
int contextSize;
float magnitude;
int values[RIVSIZE];
}denseRIV;
......@@ -99,13 +100,13 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(char* word, int *seeds, size_t seedCount);
void makeSparseLocations(char* word, int *seeds, int seedCount);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
int* mapI2D(int *locations, int seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
......@@ -121,7 +122,7 @@ int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, size_t seedCount);
int* addI2D(int* destination, int* locations, int seedCount);
/*subtracts a words vector from its own context. regularly used in lex building
*/
......@@ -136,6 +137,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
/* apply values at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] += *values_slider;
locations_slider++;
values_slider++;
......@@ -144,7 +146,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
return destination;
}
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int* mapI2D(int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
......@@ -160,7 +162,7 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
return destination;
}
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int* addI2D(int* destination, int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
......@@ -203,6 +205,7 @@ sparseRIV consolidateD2S(int *denseInput){
}
}
/* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge and security
......@@ -220,7 +223,7 @@ sparseRIV consolidateD2S(int *denseInput){
void makeSparseLocations(char* word, int *locations, size_t count){
void makeSparseLocations(char* word, int *locations, int count){
locations+=count;
srand(wordtoSeed(word));
int *locations_stop = locations+NONZEROS;
......
No preview for this file type
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
#if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the size of the tempBlock used in consolidation and implicit RIVs.
 * the expansion is parenthesized so that the macro keeps its value when
 * it appears inside a larger expression (e.g. x / TEMPSIZE) */
#define TEMPSIZE (3*RIVSIZE)
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
/* sparseRIV: a vector stored as parallel (location, value) arrays. */
typedef struct{
/* label for the vector, held in a fixed 100-byte buffer */
char name[100];
/* values[k] is the non-zero value found at index locations[k].
 * consolidateD2S places this array in the same allocation as locations,
 * starting at locations+count, so it must not be freed on its own */
int *values;
/* indices of the non-zero entries; consolidateD2S allocates this block
 * (2*count ints) and callers release it with free(locations) */
int *locations;
/* number of (location, value) pairs held */
size_t count;
/* presumably how often the word/document was seen - TODO confirm */
int frequency;
/* presumably the vector's magnitude (norm) - TODO confirm against users */
double magnitude;
/* not referenced by any code visible in this file - purpose unconfirmed */
int boolean;
/* presumably the amount of context aggregated into this vector - TODO confirm */
int contextSize;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
/* denseRIV: a vector stored as a full array of RIVSIZE ints, plus metadata. */
typedef struct{
/* label for the vector; fLexPush uses it as the file name inside the
 * lexicon directory */
char name[100];
/* the vector itself. denseAllocate obtains RIVSIZE+2 ints here: RIVSIZE
 * vector entries followed by the two metadata ints below */
int* values;
/* points into the values allocation (denseAllocate sets it to
 * values+RIVSIZE); it is not a separate allocation */
int* frequency;
/* presumably the vector's magnitude (norm) - TODO confirm against users */
double magnitude;
/* non-zero marks a cache entry that cacheDump must push to file */
int cached;
/* points into the values allocation (denseAllocate sets it to
 * frequency+1); it is not a separate allocation */
int *contextSize;
}denseRIV;
/*RIVKey, holds global variables used under the hood, primarily for the lexicon
* it also holds a "temp block" that will be used by the dense to sparse
* conversion and implicit RIV aggregation
*/
/* RIVData / RIVKey: file-scope state shared by the lexicon functions.
 * RIVKey is declared static in this header, so every translation unit
 * that includes the header gets its own private copy. */
struct RIVData{
/* scratch block of TEMPSIZE ints. consolidateD2S stages locations at
 * offset RIVSIZE and values at offset 2*RIVSIZE inside it */
int h_tempBlock[TEMPSIZE];
/* not referenced by any code visible in this file - purpose unconfirmed */
int tempSize;
/* directory name of the open lexicon; written by lexOpen and read by
 * fLexPush when building file paths */
char lexName[255];
/* cache of dense vectors; zeroed by lexOpen and flushed by cacheDump */
denseRIV RIVCache[CACHESIZE];
}static RIVKey;
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* contain any metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
/* flexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using
*/
int* addS2D(int* destination, sparseRIV input);
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
* consolidate I2S is temporarily deprecated. may be brought back.
* in tandem they are much faster, but less careful with RAM */
/* cacheDump flushes the RIV cache out to relevant files, backing up all
* data. this is called by the lexClose and signalSecure functions
*/
int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, size_t seedCount);
/* allocates a denseRIV filled with 0s
*/
denseRIV denseAllocate();
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum);
/* begin definitions */
/* addS2D adds a sparse vector onto a dense value array, in place.
 * destination: dense array, indexed by the locations held in input
 * input:       sparse vector whose count pairs are applied in order
 * returns destination, to allow chaining
 */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* walk the pairs by index: each value lands on the slot named by its location */
	for(size_t pair = 0; pair < input.count; pair++){
		int where = input.locations[pair];
		destination[where] += input.values[pair];
	}
	return destination;
}
/* mapI2D builds a fresh dense array from an implicit RIV.
 * locations:  indices in order of generation; entries alternate between
 *             contributing +1 (even positions) and -1 (odd positions)
 * valueCount: number of entries in locations
 * returns a newly allocated, caller-owned array of RIVSIZE ints (release
 * with free), or NULL if the allocation fails.
 * a trailing unpaired entry (odd valueCount) is applied as +1 rather than
 * reading one element past the end of locations.
 */
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
	if(!destination){
		/* nothing can be mapped without a destination; let the caller decide */
		return NULL;
	}
	for(size_t i = 0; i < valueCount; i++){
		/* sign is decided purely by position in the sequence */
		destination[locations[i]] += (i % 2 == 0) ? 1 : -1;
	}
	return destination;
}
/* addI2D adds an implicit RIV onto an existing dense array, in place.
 * destination: dense array to be modified
 * locations:   indices taken two at a time; the first of each pair
 *              receives +1 and the second receives -1
 * valueCount:  number of entries in locations (expected to be even)
 * returns destination, to allow chaining
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	/* step through the locations one pair per iteration */
	for(size_t pair = 0; pair < valueCount; pair += 2){
		destination[locations[pair]] += 1;
		destination[locations[pair+1]] -= 1;
	}
	return destination;
}
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp);
free(denseTemp);
return sparseOut;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0;
int add = 1;
int found;
for(int i=0; i<valueCount; i++){
found = 0;
for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add;
add *= -1;
found = 1;
}
}
if(!found){
locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add;
sparseOut.count++;
add*= -1;
}
}
sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
return sparseOut;
}*/
/* consolidateD2S converts a dense value array into a sparseRIV holding only
 * the non-zero entries.
 * denseInput: array of RIVSIZE ints
 * returns a sparseRIV whose locations/values share ONE allocation (values
 * begins at locations+count); the caller releases it with
 * free(result.locations). every other field of the result is zeroed: no
 * metadata is carried over.
 * on allocation failure a message is printed and the result has count 0
 * with locations and values both NULL, so it is safe to inspect and to free.
 */
sparseRIV consolidateD2S(int *denseInput){
	/* start from an all-zero struct so no field is returned indeterminate */
	sparseRIV output = {0};

	/* key/value pairs are first gathered in a worst-case sized temporary slot */
	int* locations = RIVKey.h_tempBlock+RIVSIZE;
	int* values = locations+RIVSIZE;
	size_t found = 0;
	for(int i=0; i<RIVSIZE; i++){
		/* act only on non-zeros */
		if(denseInput[i]){
			locations[found] = i;
			values[found] = denseInput[i];
			found++;
		}
	}

	/* one block holds locations followed by values. at least one int is
	 * always requested, so a NULL result can only mean a real failure
	 * (malloc(0) is allowed to return NULL on success) */
	size_t slots = found ? found*2 : 1;
	output.locations = (int*) malloc(slots*sizeof(int));
	if(!output.locations){
		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
		/* count stays 0 and both pointers stay NULL: never copy into NULL */
		return output;
	}

	memcpy(output.locations, locations, found*sizeof(int));
	output.values = output.locations + found;
	memcpy(output.values, values, found*sizeof(int));
	output.count = found;
	return output;
}
/* lexOpen prepares the lexicon stored in directory lexName.
 * - creates the directory when stat reports it cannot be found
 * - records the directory name in RIVKey.lexName (truncated, with a
 *   warning, if it does not fit the buffer)
 * - installs signalSecure as the SIGSEGV handler so that cached data is
 *   flushed on a segmentation fault
 * - clears the RIV cache
 * a NULL lexName is reported and otherwise ignored.
 */
void lexOpen(char* lexName){
	if(!lexName){
		puts("lexOpen requires a lexicon directory name");
		return;
	}

	struct stat st;
	if (stat(lexName, &st) == -1) {
		if(mkdir(lexName, 0777) == -1){
			printf("lexicon directory could not be created: %s\n", lexName);
		}
	}

	/* bounded copy: the name buffer is fixed-size and the input is not */
	int needed = snprintf(RIVKey.lexName, sizeof RIVKey.lexName, "%s", lexName);
	if(needed < 0 || (size_t)needed >= sizeof RIVKey.lexName){
		printf("lexicon name too long, truncated: %s\n", lexName);
	}

	/* protect cached vectors against segmentation faults */
	signal(SIGSEGV, signalSecure);

	/* start with an empty cache of dense RIVs */
	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* lexClose flushes the RIV cache to the lexicon files via cacheDump and
 * reports to stdout when any entry could not be written.
 */
void lexClose(){
	int failures = cacheDump();
	if(failures != 0){
		puts("cache dump failed, some lexicon data was lost");
	}
}
/* wordtoSeed folds the characters of a NUL-terminated word into a seed.
 * each successive character is shifted 5 bits further left than the one
 * before it and added in, i.e. characters are treated as digits in base 32.
 * word: NUL-terminated string
 * returns the seed (0 for an empty word)
 *
 * the arithmetic is done on unsigned int with the shift distance wrapped
 * to the 0..31 range: shifting by the full width of the type, and
 * overflowing a signed int, are both undefined in C, and both happened for
 * any word longer than 6 characters. wrapping at 32 assumes a 32-bit int
 * and is intended to reproduce the seeds that x86 builds generated in
 * practice - NOTE(review): confirm against an existing lexicon.
 */
int wordtoSeed(unsigned char* word){
	unsigned int seed = 0;
	unsigned int shift = 0;
	while(*word){
		seed += (unsigned int)(*word) << shift;
		/* next character sits 5 bits higher, wrapping inside the word width */
		shift = (shift + 5) % 32;
		word++;
	}
	return (int)seed;
}
/* makeSparseLocations appends the NONZEROS pseudo-random locations that
 * belong to word into the locations array.
 * word:      NUL-terminated word; its seed (wordtoSeed) re-seeds rand(), so
 *            the same word always produces the same locations
 * locations: array being filled
 * count:     number of entries already present; writing starts at
 *            locations[count]
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
	int *slot = locations + count;
	srand(wordtoSeed(word));
	/* two draws per pass; NONZEROS is required to be even */
	for(int filled = 0; filled < NONZEROS; filled += 2){
		slot[filled] = rand()%RIVSIZE;
		slot[filled+1] = rand()%RIVSIZE;
	}
}
/* fLexPush writes a dense vector to the file <lexName>/<name>.
 * file layout: size_t count, int frequency, int contextSize, float
 * magnitude, then either count locations followed by count values (sparse
 * form) or RIVSIZE values (dense form, count written as 0). the sparse
 * form is chosen when fewer than RIVSIZE/2 entries are non-zero.
 * RIVout: vector to store
 * returns 0 on success and 1 on failure.
 * ownership: RIVout.values is freed only when 0 is returned; on failure
 * the caller still owns it.
 */
int fLexPush(denseRIV RIVout){
	/* large enough for any lexName (255) plus '/' plus any name (100) */
	char pathString[400] = {0};
	int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, RIVout.name);
	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", RIVout.name);
		return 1;
	}

	/* word data is placed in a (new?) file named after the word itself */
	FILE *lexWord = fopen(pathString, "wb");
	if(!lexWord){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}

	sparseRIV temp = consolidateD2S(RIVout.values);
	int storeSparse = temp.count < (size_t)(RIVSIZE/2);
	/* a stored count of 0 marks the dense form */
	size_t storedCount = storeSparse ? temp.count : 0;
	/* the file holds a 4-byte float: convert the value instead of writing
	 * the first sizeof(float) bytes of the in-memory double */
	float storedMagnitude = (float)RIVout.magnitude;

	int failed = 0;
	failed |= fwrite(&storedCount, sizeof(size_t), 1, lexWord) != 1;
	failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
	failed |= fwrite(RIVout.contextSize, sizeof(int), 1, lexWord) != 1;
	failed |= fwrite(&storedMagnitude, sizeof(float), 1, lexWord) != 1;
	if(storeSparse){
		/* smaller stored as sparse vector */
		if(temp.count){
			failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
			failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
		}
	}else{
		/* saturation is too high, better to store dense */
		failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
	}
	/* fclose flushes buffered data, so its result is part of the write */
	failed |= fclose(lexWord) != 0;
	free(temp.locations);

	if(failed){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}
	free(RIVout.values);
	return 0;
}
/* fLexPull reads one vector from an open lexicon file into a new denseRIV.
 * expected layout: size_t count, int frequency, int contextSize, float
 * magnitude, then count locations followed by count values when count is
 * non-zero, or RIVSIZE values when count is 0.
 * lexWord: file opened for binary reading, positioned at its start
 * returns a denseRIV obtained from denseAllocate with cached set to 0.
 * the file content is not trusted: a short read, a count larger than
 * RIVSIZE, or a failed allocation leaves the vector all-zero (a message is
 * printed), and any stored location outside 0..RIVSIZE-1 is ignored.
 * if denseAllocate itself fails, the result is returned with values NULL.
 */
denseRIV fLexPull(FILE* lexWord){
	denseRIV output = denseAllocate();
	output.cached = 0;
	if(!output.values){
		puts("lexicon pull failed: out of memory");
		return output;
	}

	/* get metadata for vector */
	size_t typeCheck = 0;
	float storedMagnitude = 0;
	size_t headerItems = 0;
	headerItems += fread(&typeCheck, sizeof(size_t), 1, lexWord);
	headerItems += fread(output.frequency, sizeof(int), 1, lexWord);
	headerItems += fread(output.contextSize, sizeof(int), 1, lexWord);
	headerItems += fread(&storedMagnitude, sizeof(float), 1, lexWord);
	if(headerItems != 4){
		puts("lexicon pull failed: incomplete header");
		*output.frequency = 0;
		*output.contextSize = 0;
		return output;
	}
	/* the file holds a 4-byte float; widen it rather than reading its
	 * bytes straight into the double */
	output.magnitude = storedMagnitude;

	/* first value stored is the value count if sparse, and 0 if dense */
	if (typeCheck){
		if(typeCheck > (size_t)RIVSIZE){
			puts("lexicon pull failed: implausible value count");
			return output;
		}
		/* locations and values are stored back to back: read them as one block */
		int *pairs = (int*)malloc(typeCheck*2*sizeof(int));
		if(!pairs){
			puts("lexicon pull failed: out of memory");
			return output;
		}
		if(fread(pairs, sizeof(int), typeCheck*2, lexWord) == typeCheck*2){
			int *locations = pairs;
			int *values = pairs+typeCheck;
			for(size_t i=0; i<typeCheck; i++){
				/* never index outside the dense array on bad data */
				if(locations[i] >= 0 && locations[i] < RIVSIZE){
					output.values[locations[i]] += values[i];
				}
			}
		}else{
			puts("lexicon pull failed: incomplete sparse vector");
		}
		free(pairs);
	}else{
		/* count of 0 is only a flag: the full dense array follows */
		size_t got = fread(output.values, sizeof(int), RIVSIZE, lexWord);
		if(got != (size_t)RIVSIZE){
			/* an all-zero vector is stored with no entries at all, so an
			 * empty body is legitimate; only a partial one is reported */
			memset(output.values, 0, RIVSIZE*sizeof(int));
			if(got){
				puts("lexicon pull failed: incomplete dense vector");
			}
		}
	}
	return output;
}
/* signalSecure is the signal handler installed by lexOpen: it flushes the
 * RIV cache, reports the outcome on stdout, restores the default action
 * for the signal, and terminates the process with exit status 1.
 * signum: the signal being handled
 * NOTE(review): puts and exit are not on the POSIX async-signal-safe list;
 * this handler is best-effort.
 */
void signalSecure(int signum){
	const char *report = cacheDump()
		? "cache dump failed, some lexicon data lost"
		: "cache dumped successfully";
	puts(report);
	signal(signum, SIG_DFL);
	exit(1);
}
/* cacheDump pushes every occupied cache slot to its lexicon file.
 * returns the number of slots that could not be written (0 means the
 * whole cache was flushed).
 * a slot that was written successfully is cleared afterwards: fLexPush
 * releases the vector's storage on success, so leaving the slot marked as
 * cached would make a later dump (for example lexClose followed by the
 * signal handler) push and free the same storage a second time.
 * slots that failed to write are left untouched.
 */
int cacheDump(){
	int flag = 0;
	for(int i=0; i<CACHESIZE; i++){
		denseRIV *slot = &RIVKey.RIVCache[i];
		if(!slot->cached){
			continue;
		}
		int failed = fLexPush(*slot);
		if(!failed){
			/* storage is gone: forget the stale pointers and the cached mark */
			memset(slot, 0, sizeof *slot);
		}
		flag += failed;
	}
	return flag;
}
/* denseAllocate returns a denseRIV describing an all-zero vector.
 * a single zeroed allocation of RIVSIZE+2 ints backs the result: values
 * covers the first RIVSIZE ints, frequency points at the next int and
 * contextSize at the one after it. release everything with
 * free(result.values).
 * name is empty, magnitude is 0 and cached is 0.
 * if the allocation fails, values, frequency and contextSize are all NULL.
 */
denseRIV denseAllocate(){
	/* zero every field first, including the name buffer */
	denseRIV output = {0};
	output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
	if(!output.values){
		/* do not derive the metadata pointers from a NULL base */
		return output;
	}
	/* for compact memory use, frequency is placed immediately after values */
	output.frequency = output.values+RIVSIZE;
	output.contextSize = output.frequency+1;
	return output;
}
/*TODO add a simplified free function*/
#endif
File added
#include <stdio.h>
#define CACHESIZE 0
#define CACHEEXCLUSIVE 1
#define RIVSIZE 50000
#include "RIVtools.h"
char* stem(char* word);
/* reads ../books/pg56902.txt word by word, stems each word through the
 * WN/ files (see stem), pulls the stemmed word's vector from the lexicon
 * "consolidatedLexicon50-8" and adds it into a running dense accumulator.
 * words missing from WN/ or from the lexicon are reported on stdout.
 * returns 1 if the text file cannot be opened, otherwise 0.
 * NOTE(review): the accumulated vector is never written or printed.
 */
int main(){
	lexOpen("consolidatedLexicon50-8");
	FILE* text = fopen("../books/pg56902.txt", "r");
	if(!text){
		puts("no file");
		return 1;
	}
	denseRIV accumulate = {0};
	sparseRIV temp;
	char word[100];
	/* stop on the first failed conversion. fscanf returns EOF (non-zero)
	 * at end of input, and testing feof after a successful read would
	 * discard the final word of a file with no trailing whitespace */
	while(fscanf(text, "%99s", word) == 1){
		if(stem(word)){
			denseRIV* wordRIV = lexPull(word);
			if(!wordRIV){
				printf("%s, not in lexicon\n", word);
				continue;
			}
			temp = consolidateD2S(wordRIV->values);
			addS2D(accumulate.values, temp);
			free(temp.locations);
			free(wordRIV);
		}else{
			printf("%s, not in wordNet\n", word);
		}
	}
	fclose(text);
	lexClose();
	return 0;
}
/* stem looks a word up in the WN/ directory and rewrites it in place.
 * the file WN/<word> starts with an integer:
 *   1 - the word is accepted as it is
 *   2 - a replacement word follows; it is copied into word, then
 *       WN/<replacement> is opened and the token after its leading
 *       integer (if any) is copied into word
 *   anything else, or no readable integer - the word is rejected
 * word: in/out buffer, assumed to hold at least 100 bytes
 * returns word on success, or NULL when the word is rejected or a needed
 * file cannot be opened. every file opened here is closed before returning.
 */
char* stem(char* word){
	char pathString[200];
	int WNdata = 0;
	char *result = NULL;

	snprintf(pathString, sizeof pathString, "WN/%s", word);
	FILE* WNfile = fopen(pathString, "r");
	if(!WNfile) return NULL;

	/* an unreadable leading integer is treated like 0: rejected */
	if(fscanf(WNfile, "%d", &WNdata) != 1){
		WNdata = 0;
	}

	if(WNdata == 1){
		result = word;
	}else if(WNdata == 2){
		/* follow the redirection to the replacement word's own file */
		fscanf(WNfile, "%99s", word);
		fclose(WNfile);
		snprintf(pathString, sizeof pathString, "WN/%s", word);
		WNfile = fopen(pathString, "r");
		if(!WNfile) return NULL;
		fscanf(WNfile, "%*d%99s", word);
		result = word;
	}

	fclose(WNfile);
	return result;
}
File added
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
/* walks the lexicon directory named by argv[1], keeps every vector whose
 * contextSize is at least 7000, normalizes it, and finally pushes the kept
 * vectors into the lexicon "consolidatedLexicon50-8".
 * one line per kept vector is printed:
 *   frequency,contextSize,magnitude,kept-index,files-seen
 * returns 1 when no directory is given or it cannot be opened, else 0.
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		puts("give me a directory");
		return 1;
	}
	lexOpen(argv[1]);
	denseRIV* intake;
	sparseRIV examine;
	static denseRIV *output[60000] = {0};
	const int outputCapacity = (int)(sizeof output / sizeof output[0]);
	DIR *directory;
	struct dirent *files = 0;
	if(!(directory = opendir(argv[1]))){
		printf("location not found, %s\n", argv[1]);
		return 1;
	}
	int i=0;
	int j=0;
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			/* the lexicon should not have valid sub-directories */
			continue;
		}
		j++;
		/* vector names live in fixed-size buffers: skip what cannot fit */
		if(strlen(files->d_name) >= sizeof examine.name){
			printf("%s, name too long, skipped\n", files->d_name);
			continue;
		}
		intake = lexPull(files->d_name);
		if(!intake){
			printf("%s, could not be read\n", files->d_name);
			continue;
		}
		/* only vectors with a large enough context are considered
		 * statistically significant, and useful */
		if(intake->contextSize<7000){
			free(intake);
			continue;
		}
		if(i >= outputCapacity){
			puts("output table is full, remaining files skipped");
			free(intake);
			break;
		}
		examine = normalize(*intake, 10000);
		strcpy(examine.name, files->d_name);
		printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
		output[i] = calloc(1, sizeof(denseRIV));
		if(!output[i]){
			puts("memory allocation failed");
			free(intake);
			free(examine.locations);
			break;
		}
		addS2D(output[i]->values, examine);
		output[i]->magnitude = examine.magnitude;
		strcpy(output[i]->name, files->d_name);
		output[i]->frequency = intake->frequency;
		output[i]->contextSize = intake->contextSize;
		free(intake);
		free(examine.locations);
		i++;
	}
	closedir(directory);
	lexClose();

	/* re-open on the destination lexicon and store everything that was kept */
	lexOpen("consolidatedLexicon50-8");
	for(int k=0; k<i; k++){
		lexPush(output[k]);
	}
	lexClose();
	return 0;
}
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtools.h"
#define THRESHOLD 0.70
/* this program identifies all near-duplicates among the documents in the
* chosen root directory, using RIV comparison */
// fills the fileRIVs array with a vector for each file in the root directory
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* finds near-duplicate documents under the directory given as argv[1].
 * every file is turned into a sparse vector (directoryToL2s); each vector
 * is then compared with all earlier ones whose magnitude lies within 15%
 * of its own, and pairs whose cosine exceeds THRESHOLD are printed.
 * returns 1 on a missing or over-long directory argument or on allocation
 * failure, otherwise 0.
 */
int main(int argc, char *argv[]){
	int fileCount = 0;
	char rootString[2000];
	if(argc <2){
		printf("give me a directory");
		return 1;
	}
	/* bounded build of "<dir>/": the argument length is not under our control */
	int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
	if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
		printf("directory path too long");
		return 1;
	}

	//initializes the fileRIVs array to be reallocced by later function
	sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
	if(!fileRIVs){
		printf("memory allocation failed");
		return 1;
	}

	//gather all vectors into the fileRIVs array and count them in fileCount
	directoryToL2s(rootString, &fileRIVs, &fileCount);
	printf("fileCount: %d\n", fileCount);

	//first calculate all magnitudes for later use
	for(int i = 0; i < fileCount; i++){
		fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
	}

	clock_t begintotal = clock();
	double cosine;
	double minmag;
	double maxmag;
	//all cosines need a sparse-dense comparison, so one dense vector is reused
	denseRIV baseDense;
	for(int i = 0; i < fileCount; i++){
		//0 out the denseVector, and map the next sparseVector to it
		memset(&baseDense, 0, sizeof(denseRIV));
		addS2D(baseDense.values, fileRIVs[i]);
		//pass magnitude to the dense vector
		baseDense.magnitude = fileRIVs[i].magnitude;
		//if two vectors are too different in size, they cannot be duplicates
		minmag = baseDense.magnitude*.85;
		maxmag = baseDense.magnitude*1.15;
		for(int j = 0; j < i; j++){
			//if this vector is within magnitude threshold
			if(fileRIVs[j].magnitude < maxmag
			&& fileRIVs[j].magnitude > minmag){
				//identify the similarity of these two vectors
				cosine = cosCompare(baseDense, fileRIVs[j]);
				//if the two are similar enough to be flagged
				if(cosine>THRESHOLD){
					printf("%s\t%s\n%f\n", fileRIVs[i].name , fileRIVs[j].name, cosine);
				}
			}
		}
	}
	printf("fileCount: %d", fileCount);
	free(fileRIVs);
	clock_t endtotal = clock();
	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
	printf("total time:%lf\n\n", time_spent);
	return 0;
}
//mostly a standard recursive Dirent-walk
/* directoryToL2s recursively walks rootString and appends one sparse
 * vector per readable file to the growing array *fileRIVs.
 * rootString: directory path that already ends in '/'
 * fileRIVs:   in/out pointer to a heap array; it is reallocated as it
 *             grows, so the caller must re-read *fileRIVs afterwards
 * fileCount:  in/out number of vectors stored in *fileRIVs
 * entries whose name starts with '.' are skipped, as are paths that do not
 * fit the path buffer. each vector's name is its path, truncated to the
 * size of the name field. the walk of a directory stops early if the
 * array cannot be grown.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	/* *** begin Dirent walk *** */
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;
	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(!files->d_name[0]) break;
		/* skip ".", ".." and hidden entries. going back through the loop
		 * condition keeps the end-of-directory check in force */
		if(files->d_name[0]=='.') continue;

		int isDirectory = (files->d_type == DT_DIR);
		int length = snprintf(pathString, sizeof pathString, "%s%s%s",
				rootString, files->d_name, isDirectory ? "/" : "");
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipped: %s%s\n", rootString, files->d_name);
			continue;
		}
		if(isDirectory){
			directoryToL2s(pathString, fileRIVs, fileCount);
			continue;
		}
		/* *** end dirent walk, begin meat of function *** */
		FILE *input = fopen(pathString, "r");
		if(input){
			/* grow through a temporary so the array survives a failed realloc */
			sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
			if(!grown){
				printf("memory allocation failed");
				fclose(input);
				break;
			}
			*fileRIVs = grown;
			(*fileRIVs)[*fileCount] = fileToL2(input);
			/* the name field is much smaller than a path can be */
			snprintf((*fileRIVs)[*fileCount].name, sizeof (*fileRIVs)[*fileCount].name, "%s", pathString);
			fclose(input);
			*fileCount += 1;
		}
	}
	closedir(directory);
}
#include <stdio.h>
#define RIVSIZE 25000
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
......@@ -7,8 +7,6 @@
int main(int argc, char* argv[]){
lexOpen(argv[1]);
denseRIV* intake;
sparseRIV examine;
static denseRIV *output[60000] = {0};
DIR *directory;
struct dirent *files = 0;
......@@ -28,27 +26,15 @@ int main(int argc, char* argv[]){
intake = lexPull(files->d_name);
/* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */
if(intake->contextSize<10000)continue;
examine = normalize(*intake, 500);
strcpy(examine.name, files->d_name);
printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
output[i] = calloc(1, sizeof(denseRIV));
addS2D(output[i]->values, examine);
output[i]->magnitude = examine.magnitude;
strcpy(output[i]->name, files->d_name);
output[i]->frequency = intake->frequency;
printf("%d,%d,%lf,%d,%s\n", intake->frequency, intake->contextSize, intake->magnitude, i, files->d_name);
free(intake);
free(examine.locations);
i++;
}
lexClose();
/*lexOpen("consolidatedLexiconAggressive");
for(int j=0; j<i; j++){
lexPush(output[j]);
}
lexClose();*/
return 0;
}
......@@ -3,6 +3,17 @@
#include "RIVLower.h"
#include "RIVaccessories.h"
#include "assert.h"
#ifndef CACHEEXCLUSIVE
#define CACHEEXCLUSIVE 0
#endif
#define IODISPLACEMENT (sizeof(((sparseRIV*)0)->count)\
+ sizeof(((sparseRIV*)0)->frequency)\
+ sizeof(((sparseRIV*)0)->contextSize)\
+ sizeof(((sparseRIV*)0)->magnitude))\
/ sizeof(int)
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
......@@ -24,7 +35,12 @@ void lexClose();
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
/* cacheCheckOnPush tests the state of this vector in our lexicon cache
* and returns 1 on "success" indicating cache storage and no need to push to file
* or returns 0 on "failure" indicating that the vector need be pushed to file
*/
int cacheCheckOnPush(denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
......@@ -32,6 +48,11 @@ int cacheCheckOnPush(denseRIV* RIVout);
*/
denseRIV* lexPull(char* word);
/* cacheCheckonPull checks if the word's vector is stored in cache,
* and returns a pointer to that vector on success
* or returns a NULL pointer if the word is not cached, indicating a need
* to pull from file
*/
denseRIV* cacheCheckOnPull(char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
......@@ -51,6 +72,10 @@ denseRIV* fLexPull(FILE* lexWord);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
/* used exclusively by flexpush to determine write-style (sparse or dense)
* and also formats the "IOstagingSlot" for fwrite as a single block if sparse
*/
int saturationForStaging(denseRIV* output);
/* begin definitions */
void lexOpen(char* lexName){
......@@ -94,7 +119,8 @@ denseRIV* cacheCheckOnPull(char* word){
}
#endif
denseRIV* lexPull(char* word){
denseRIV* output;
denseRIV* output = NULL;
#if CACHESIZE > 0
......@@ -105,24 +131,30 @@ denseRIV* lexPull(char* word){
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
char pathString[200];
sprintf(pathString, "%s/%s", RIVKey.lexName, word);
FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */
if(lexWord){
/* pull data from file */
output = fLexPull(lexWord);
strcpy(output->name, word);
fclose(lexWord);
}else{
/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons?
#if CACHEEXCLUSIVE == 0
/*if file does not exist, return a 0 vector (word is new to the lexicon */
output = calloc(1, sizeof(denseRIV));
strcpy(output->name, word);
#endif
/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
}
strcpy(output->name, word);
return output;
}
#if CACHESIZE > 0
......@@ -159,87 +191,170 @@ int cacheCheckOnPush(denseRIV* RIVout){
}
#endif
int lexPush(denseRIV* RIVout){
#if CACHESIZE > 0
#if CACHESIZE > 0
/* check the cache to see if it belongs in cache */
if(cacheCheckOnPush(RIVout)){
/* if the cache check returns 1, it has been dealth with in cache */
return 0;
}
#endif /* CACHESIZE != 0 */
/* find the cache-slot where this word belongs */
/* push to the lexicon */
return fLexPush(RIVout);
}
/* saturationForStaging counts the non-zero entries of output->values and,
 * while doing so, lays the vector out in sparse form inside IOstagingSlot:
 *   int 0        number of non-zero entries (the return value)
 *   int 1        output->frequency
 *   int 2        output->contextSize
 *   int 3        output->magnitude, stored through a float pointer
 *   int 4...     the locations, followed immediately by the values
 * fLexPush writes (count*2)+4 ints from IOstagingSlot when it chooses the
 * sparse form.
 * returns the number of non-zero entries found.
 * NOTE(review): IOstagingSlot is RIVKey.h_tempBlock+RIVSIZE; if the block
 * holds 3*RIVSIZE ints, a fully saturated vector needs 4 ints more than
 * remain after the slot - confirm the block size used with this code.
 */
int saturationForStaging(denseRIV* output){
/* key/value pairs will be loaded to a worst-case sized temporary slot */
int* count = IOstagingSlot;
*count = 0;
/* the three metadata words follow the count */
*(count+1) = output->frequency;
*(count+2) = output->contextSize;
*(float*)(count+3) = output->magnitude;
/* locations are gathered directly in their final position */
int* locations = IOstagingSlot+4;
/* values are gathered RIVSIZE ints before the staging slot, because their
 * final position is not known until the count is */
int* values = IOstagingSlot-RIVSIZE;;
int* locations_slider = locations;
int* values_slider = values;
for(int i=0; i<RIVSIZE; i++){
/* act only on non-zeros */
if(output->values[i]){
/* assign index to locations */
*(locations_slider++) = i;
/* assign value to values */
*(values_slider++) = output->values[i];
/* track size of forming sparseRIV */
*count += 1;
}
}
/* copy values into slot immediately after locations */
memcpy(locations+*count, values, (*count)*sizeof(int));
return *count;
}
int fLexPush(denseRIV* output){
char pathString[200] = {0};
denseRIV RIVout = *output;
/* word data will be placed in a (new?) file under the lexicon directory
* in a file named after the word itself */
sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
FILE *lexWord = fopen(pathString, "wb");
sprintf(pathString, "%s/%s", RIVKey.lexName, output->name);
int saturation = saturationForStaging(output);
if( saturation < RIVSIZE/2){
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1;
}
sparseRIV temp = consolidateD2S(RIVout.values);
if(temp.count<(RIVSIZE/2)){
/* smaller stored as sparse vector */
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(temp.locations, temp.count, sizeof(int), lexWord);
fwrite(temp.values, temp.count, sizeof(int), lexWord);
fwrite(IOstagingSlot, (saturation*2)+4, sizeof(int), lexWord);
fclose(lexWord);
}else{
/* saturation is too high, better to store dense */
/* there's gotta be a better way to do this */
temp.count = 0;
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
output->cached = 0;
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1;
}
fwrite(&output->cached, sizeof(int), RIVSIZE+4, lexWord);
fclose(lexWord);
}
/* older way of writing, kept while debugging new */
//~ if(temp.count<(RIVSIZE/2)){
//~ /* smaller stored as sparse vector */
//~ *writeStaging = temp.count;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, temp.locations, temp.count*2*sizeof(int));
//~ stagingSize += temp.count*2*sizeof(int);
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(temp.locations, temp.count, sizeof(int), lexWord);
//~ fwrite(temp.values, temp.count, sizeof(int), lexWord);*/
//~ }else{
//~ /* saturation is too high, better to store dense */
//~ /* there's gotta be a better way to do this */
//~ *writeStaging = 0;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, RIVout.values, sizeof(int)*RIVSIZE);
//~ stagingSize +=sizeof(int)*RIVSIZE;
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);*/
//~ }
free(output);
free(temp.locations);
return 0;
}
denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV));
size_t typeCheck;
int typeCheck;
/* get metadata for vector */
fread(&typeCheck, 1, sizeof(size_t), lexWord);
fread(&output->frequency, 1, sizeof(int), lexWord);
fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
fread(&output->magnitude, 1, sizeof(float), lexWord);
if(!fread(&typeCheck, 1, sizeof(int), lexWord)){
return NULL;
}
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
sparseRIV temp;
/* value was not 0, so it's the value count */
temp.count = typeCheck;
sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
assert(&temp->count == IOstagingSlot);
temp->count = typeCheck;
temp->locations = IOstagingSlot+4;
temp->values = temp->locations+temp->count;
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
printf("vector read failure");
return NULL;
}
temp.locations = (int*)malloc(temp.count*2*sizeof(int));
/*sparseRIV temp;
temp.count = typeCheck;
temp.locations = malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
fread(&output->frequency, 1, sizeof(int), lexWord);
fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
fread(&output->magnitude, 1, sizeof(float), lexWord);
fread(temp.locations, temp.count, sizeof(int), lexWord);
fread(temp.values, temp.count, sizeof(int), lexWord);
*/
addS2D(output->values, temp);
free(temp.locations);
addS2D(output->values, *temp);
}else{
/* typecheck is thrown away, just a flag in this case */
fread(output->values, RIVSIZE, sizeof(int), lexWord);
//~ fread(&output->frequency, 1, sizeof(int), lexWord);
//~ fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
//~ fread(&output->magnitude, 1, sizeof(float), lexWord);
if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure");
return NULL;
}
}
......@@ -254,7 +369,6 @@ denseRIV* fLexPull(FILE* lexWord){
int cacheDump(){
int flag = 0;
for(int i = 0; i < CACHESIZE; i++){
if(RIVKey.RIVCache[i]){
......
No preview for this file type
No preview for this file type
......@@ -6,6 +6,7 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
#define CACHESIZE 100000
#include "RIVtools.h"
//this program reads a directory full of files and adds all context vectors (treating each file as a context)
......@@ -17,6 +18,7 @@ void directoryGrind(char *rootString);
void lineGrind(char* textLine);
int main(int argc, char *argv[]){
char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
......@@ -69,7 +71,7 @@ void directoryGrind(char *rootString){
printf("skipped: %s\n", files->d_name);
continue;
}
puts(files->d_name);
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
......@@ -83,11 +85,11 @@ void directoryGrind(char *rootString){
void fileGrind(FILE* textFile){
char textLine[5000];
char textLine[10000];
// included python script separates paragraphs into lines
while(fgets(textLine, 4999, textFile)){
int i=0;
while(fgets(textLine, 9999, textFile)){
printf("line: %d\n", i++);
if(!strlen(textLine)) continue;
if(feof(textFile)) break;
......@@ -100,6 +102,10 @@ void fileGrind(FILE* textFile){
void lineGrind(char* textLine){
//extract a context vector from this text set
sparseRIV contextVector = textToL2(textLine);
if(contextVector.contextSize <= 1){
free(contextVector.locations);
return;
}
denseRIV* lexiconRIV;
//identify stopping point in line read
......@@ -110,6 +116,7 @@ void lineGrind(char* textLine){
sscanf(textLine, "%99s%n", word, &displacement);
//we ensure that each word exists, and is free of unwanted characters
textLine += displacement+1;
if(!(*word))continue;
if(!isWordClean((char*)word)){
......@@ -132,7 +139,7 @@ void lineGrind(char* textLine){
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
textLine += displacement+1;
}
//free the heap allocated context vector data
......
No preview for this file type
No preview for this file type
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import matplotlib.pyplot as plt
import math
def fit(x):
    """Evaluate the fitted curve 1067 + 94500000 / x at x.

    The script calls this with a word frequency and compares the word's
    magnitude against the returned value, widened by a tolerance band.

    x must be non-zero; a zero value raises ZeroDivisionError.
    NOTE(review): under Python 2 an integer x makes the division truncate;
    confirm which interpreter this script targets.
    """
    scale = 1            # overall multiplier, kept from the original expression
    offset = 1067        # constant term of the fit
    numerator = 94500000  # coefficient of the 1/x term
    return scale * (offset + numerator / x)
data = open("../code/RIVet/graphdata.txt", "r");
frequencies = [];
mags = [];
i = 0;
for line in data:
if(int(line.split(",")[1])>40000):
x = 7
range = 0.15
while(1):
range = input("gimmerange");
data = open("graphdata.txt", "r");
frequencies = [];
mags = [];
fitline = [];
i = 0;
for line in data:
segments = line.split(",")
freq = int(segments[1])
mag = float(segments[2])
name = segments[4];
if(freq>40000):
continue;
frequencies.append(int(line.split(",")[1]))
mags.append(float(line.split(",")[2]))
if(mags[i]>80 and frequencies[i]>7000 and frequencies[i]<15000):
print(line)
core = fit(freq)
fitmax = core*(1+range);
fitmin = core*(1-range);
if(mag >fitmax or mag < fitmin):
continue
frequencies.append(freq)
mags.append(mag)
fitline.append(fit(freq));
print("{} {} {}".format(name, freq, mag))
i+=1
plt.scatter(frequencies, mags)
plt.show()
#plt.scatter(frequencies, mags)
plt.plot(frequencies, fitline, 'r^', frequencies, mags, 'bs')
plt.show()
x+=1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment