Commit fe20c6f5 by etcart

updated lots of stuff

parent 60856c1d
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "stemconfig/stemset.h"
struct treenode{ struct treenode{
void* data; void* data;
...@@ -11,14 +12,14 @@ struct treenode{ ...@@ -11,14 +12,14 @@ struct treenode{
struct treenode* links[26]; struct treenode* links[26];
int downstream; int downstream;
}; }*nextNode;
void stemInsert(struct treenode* node, char* letter, void* data);
int treecut(struct treenode* node, char* letter); int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data); void treeInsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter); void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup(); struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
...@@ -64,27 +65,34 @@ int wordtoSeed(char* word){ ...@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
return seed; return seed;
} }
struct treenode* stemTreeSetup(){ struct treenode* stemTreeSetup(){
FILE* netfile = fopen("stemnet2.txt", "r"); FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
if(!netfile){ if(!wordFile){
printf("no stemnet file"); printf("no wordnet file");
return 0; return 0;
} }
struct treenode* rootNode = calloc(1, sizeof(struct treenode)); struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
nextNode = rootNode+1;
char word[100]; char word[100];
char stem[100]; char* stem = (char*)stemset;
int displacement;
while(fscanf(netfile, "%s %s", word, stem)){ while(fscanf(wordFile, "%s", word)){
if(feof(netfile)){ sscanf(stem, "%*s%n", &displacement);
break; stem[displacement] = '\0';
}
stemInsert(rootNode, word, stem);
stemInsert(rootNode, word, stem);
if(feof(wordFile)){
break;
}
stem += displacement+1;
} }
fclose(wordFile);
return rootNode; return rootNode;
} }
void* treeSearch(struct treenode* node, char* letter){ void* treeSearch(struct treenode* node, char* letter){
...@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){ ...@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
return node->data; return node->data;
} }
} }
void RIVinsert(struct treenode* node, char* letter, void* data){ void stemInsert(struct treenode* node, char* letter, void* data){
node->downstream++; node->downstream++;
if(*(letter)){ if(*(letter)){
if(!node->links[*(letter)-'a']){ if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode)); node->links[*(letter)-'a'] = nextNode++;
} }
RIVinsert(node->links[*(letter)-'a'], letter+1, data); treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{ }else{
...@@ -119,44 +127,47 @@ void RIVinsert(struct treenode* node, char* letter, void* data){ ...@@ -119,44 +127,47 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
} }
} }
void stemInsert(struct treenode* node, char* letter, char* data){ void treeInsert(struct treenode* node, char* letter, void* data){
node->downstream++; node->downstream++;
if(*(letter)){ if(*(letter)){
if(!node->links[*(letter)-'a']){ if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode)); node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
} }
stemInsert(node->links[*(letter)-'a'], letter+1, data); treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{ }else{
if(node->data) return;
node->data = calloc(strlen(data)+1, sizeof(char));
if(node->data) return;
node->data = data;
strcpy((char*)node->data, data);
} }
} }
int treecut(struct treenode* node, char* letter){ int treecut(struct treenode* node, char* letter){
node->downstream--; node->downstream--;
int flag; int flag;
//continue searching downstream if there is a letter
if(*(letter)){ if(*(letter)){
if(node->links[*(letter)-'a']){ if(node->links[*(letter)-'a']){
//propagate to next section
flag = treecut(node->links[*(letter)-'a'], letter+1); flag = treecut(node->links[*(letter)-'a'], letter+1);
//if next section returned a "cut" flag, 0 it out
if(flag){ if(flag){
node->links[*(letter)-'a'] = NULL; node->links[*(letter)-'a'] = NULL;
} }
} }
if(!node->downstream){ //there are no more letters, we've reached our destination
free(node);
return 1;
}
}else{ }else{
node->data = NULL;
}
//this is on a branch that leads nowhere, free it and return "cut" flag
if(!node->downstream){
free(node); free(node);
return 1; return 1;
} }
...@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){ ...@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
} }
/* destroyTree recursively releases the search tree rooted at `node`:
 * the payload held in node->data, each of the 26 child links, and finally
 * the node itself. A NULL node is ignored, so the result of a failed tree
 * setup can be passed in without crashing.
 *
 * NOTE(review): everything here is released with free(), which is only
 * valid for nodes and payloads that were individually heap-allocated.
 * stemInsert appears to hand out nodes from the preallocated nextNode pool
 * and treeInsert stores the caller's data pointer as-is -- confirm who owns
 * that memory before calling this on a tree built that way.
 */
void destroyTree(struct treenode* node){
	if(!node) return;

	/* free(NULL) is a no-op, so an empty payload needs no separate check */
	free(node->data);
	for(int i=0; i<26; i++){
		/* absent children are NULL and are handled by the guard above */
		destroyTree(node->links[i]);
	}
	free(node);
}
#endif #endif
No preview for this file type
File added
#include <stdio.h> #include <stdio.h>
#define RIVSIZE 50000 #define RIVSIZE 50000
#define CACHESIZE 20000
#include "RIVtools.h" #include "RIVtools.h"
char* clean(char* word); #define k 5
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
typedef char label[200]; typedef char label[200];
struct RIVclass{ struct RIVclass{
label name; label name;
sparseRIV* set; sparseRIV* set;
int setSize; int setSize;
}; };
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
LEXICON* lexicon; LEXICON* lexicon;
int main(){ int main(){
struct treenode* searchRoot = stemTreeSetup(); struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx"); lexicon = lexOpen("lexiconEnron50-4", "rx");
int classNo = 0; int classNo = 0;
...@@ -25,18 +30,38 @@ int main(){ ...@@ -25,18 +30,38 @@ int main(){
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r"); FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
if(!textSet){ if(!textSet){
puts("no file"); puts("no file");
return 1; return 1;
} }
struct RIVclass* class; struct RIVclass* class = 0;
char text[20000]; char text[20000];
label className; label className;
while(fscanf(textSet, "%s\t%s", text, className)){ //int j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
//if(j++>100) break;
if(feof(textSet)) break;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){ sparseRIV temp = line2L3(text, searchRoot);
temp.magnitude = getMagnitudeSparse(temp);
if(temp.magnitude == 0){
printf("%s, empty\n", text);
continue;
}
//printf("%s, %s", text, className);
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}
if(i == classCount){
/* reinitialize the classnames with a new member */ /* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label)); classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className); strcpy(classNames[classCount], className);
...@@ -53,14 +78,10 @@ int main(){ ...@@ -53,14 +78,10 @@ int main(){
classNo = classCount; classNo = classCount;
classCount++; classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
} }
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV)); class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot); sparseRIV thing= temp;
class->set[class->setSize] = thing; class->set[class->setSize] = thing;
class->setSize++; class->setSize++;
...@@ -69,10 +90,71 @@ int main(){ ...@@ -69,10 +90,71 @@ int main(){
for(int i=0; i<classCount; i++){ for(int i=0; i<classCount; i++){
puts(classNames[i]); puts(classNames[i]);
puts(classes[i].name);
printf("%d\n\n", classes[i].setSize); printf("%d\n\n", classes[i].setSize);
} }
fclose(textSet);
textSet = fopen("../../Downloads/validationText.tsv", "r");
if(!textSet) return 1;
int won = 0;
int docTotal = 0;
//scanf("%d", &won);
//j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
if(feof(textSet)) break;
//if(j++>30) break;
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}if(i == classCount){
printf("unclassifiable\n");
continue;
}
sparseRIV thing= line2L3(text, searchRoot);
if(thing.count ==0){
continue;
}
docTotal++;
denseRIV inQuestion = {0};
addS2D(inQuestion.values, thing);
inQuestion.magnitude = getMagnitudeDense(&inQuestion);
double weights[classCount];
int choice = kNearest(weights, classes, classCount, inQuestion);
if(choice == -1){
printf("classificationFailed");
}else{
//puts(text);
printf("survey says! %s ", className);
printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
}
if(choice == classNo){
won++;
}
free(thing.locations);
}
printf("\n\n we got %d/%d ", won, docTotal);
for(int i=0; i<classCount; i++){
for(int j=0; j<classes[i].setSize; j++){
free(classes[i].set[j].locations);
}
free(classes[i].set);
}
free(classes);
free(classNames);
destroyTree(searchRoot);
lexClose(lexicon);
fclose(textSet);
return 0; return 0;
} }
...@@ -132,26 +214,74 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){ ...@@ -132,26 +214,74 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
continue; continue;
}else{ }else{
//printf("%s, succesfully pulled\n", stem); //printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values); temp = normalize(*wordRIV, 10000);
//temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp); addS2D(accumulate.values, temp);
free(temp.locations); free(temp.locations);
free(wordRIV); //free(wordRIV);
lexPush(lexicon, wordRIV);
} }
} }
} }
temp = consolidateD2S(accumulate.values); temp = consolidateD2S(accumulate.values);
return temp; return temp;
}
/* kNearest classifies inQuestion by a similarity-weighted vote among the
 * (at most) k stored vectors with the highest cosine similarity to it.
 *
 * weights:    caller-provided array of classCount doubles; it is overwritten.
 *             On return each entry holds that class's share of the summed
 *             similarity of the selected neighbours. If the summed
 *             similarity is not positive the raw per-class sums are left
 *             un-normalised instead of dividing by zero or flipping signs.
 * classes:    array of classCount classes, each holding setSize vectors
 * classCount: number of entries in classes and weights
 * inQuestion: the vector to classify
 *
 * returns the index of the class with the greatest summed similarity among
 * the classes that own at least one selected neighbour (the lowest index
 * wins ties), or -1 if no stored vector was available to compare against.
 */
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
	memset(weights, 0, classCount*sizeof(double));
	double distances[k];
	int labels[k];
	int fill = 0;

	for(int i=0; i<classCount; i++){
		for(int j=0; j<classes[i].setSize; j++){
			double cosine = cosCompare(inQuestion, classes[i].set[j]);

			/* until k neighbours are held, keep every candidate together
			 * with the class it came from */
			if(fill < k){
				distances[fill] = cosine;
				labels[fill] = i;
				fill++;
				continue;
			}
			/* once full, the only slot worth giving up is the weakest one,
			 * and only for a candidate that is more similar than it */
			int weakest = 0;
			for(int x=1; x<k; x++){
				if(distances[x] < distances[weakest]){
					weakest = x;
				}
			}
			if(cosine > distances[weakest]){
				distances[weakest] = cosine;
				labels[weakest] = i;
			}
		}
	}
	/* nothing to vote with */
	if(!fill){
		return -1;
	}

	/* only the slots that were actually filled take part in the vote */
	double totalweight = 0;
	for(int j=0; j<fill; j++){
		weights[labels[j]] += distances[j];
		totalweight += distances[j];
	}

	/* pick the winner on the raw sums, considering only classes that own a
	 * neighbour, so a class with no neighbours (weight 0) can never beat a
	 * class whose neighbours have negative similarity */
	int choice = -1;
	double tempmax = 0;
	for(int i=0; i<classCount; i++){
		int voted = 0;
		for(int j=0; j<fill; j++){
			if(labels[j] == i){
				voted = 1;
				break;
			}
		}
		if(voted && (choice == -1 || weights[i] > tempmax)){
			choice = i;
			tempmax = weights[i];
		}
	}

	/* report shares of the total only when that total is a usable divisor */
	if(totalweight > 0){
		for(int i=0; i<classCount; i++){
			weights[i] /= totalweight;
		}
	}
	return choice;
}
......
File added
#ifndef RIV_LEXICON_H #ifndef RIV_LEXICON_H
#define RIV_LEXICON_H #define RIV_LEXICON_H
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
#include "assert.h" #include "assert.h"
#ifndef READFLAG /* these flags will be used by the lexicon to know its permissions and states */
#define READFLAG 0x01 #ifndef READFLAG
#endif #define READFLAG 0x01
#endif
#ifndef WRITEFLAG
#define WRITEFLAG 0x02 #ifndef WRITEFLAG
#endif #define WRITEFLAG 0x02
#endif
#ifndef INCFLAG
#define INCFLAG 0x04 #ifndef INCFLAG
#endif #define INCFLAG 0x04
#endif
#ifndef CACHEFLAG
#define CACHEFLAG 0x08 #ifndef CACHEFLAG
#endif #define CACHEFLAG 0x08
#endif
#ifndef SORTCACHE
#ifndef HASHCACHE /* if user has specified neither hashed nor sorted cache we assume sorted
#define SORTCACHE * hashed strategy is extremely CPU and memory light, but very inneffective
#endif * at ensuring the most important vectors are cached. as such it is better
#endif * optimized for RAMdisks and unusually fast SSDs. the sorted strategy
typedef struct{ * is much more expensive for the CPU, but ensures the minimum possible
char lexName[100]; * hard-drive read writes far more effectively */
denseRIV* *cache;
struct cacheList* listPoint; #ifndef SORTCACHE
char flags; #ifndef HASHCACHE
#ifdef SORTCACHE #define SORTCACHE
struct treenode* treeRoot; #endif
#endif /* SORTCACHE */ #endif
}LEXICON; /* the LEXICON struct will be used similar to a FILE (as a pointer) which
struct cacheList{ * contains all metadata that a lexicon needs in order to be read and written to safely*/
denseRIV* *cache; typedef struct{
struct cacheList* next; char lexName[100];
struct cacheList* prev; denseRIV* *cache;
}*rootCache = NULL; struct cacheList* listPoint;
char flags;
#define IODISPLACEMENT (sizeof(((sparseRIV*)0)->count)\ #ifdef SORTCACHE
+ sizeof(((sparseRIV*)0)->frequency)\ /* if our cache is sorted, we will need a search tree and a saturation */
+ sizeof(((sparseRIV*)0)->contextSize)\ struct treenode* treeRoot;
+ sizeof(((sparseRIV*)0)->magnitude))\ int cacheSaturation;
/ sizeof(int) denseRIV* *cache_slider;
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better #endif /* SORTCACHE */
}LEXICON;
/* this will form a linked list of caches, so that all data can be safely dumped
/* lexOpen is called to "open the lexicon", setting up for later calls to * in event of an error, no matter how many or how strangely lexica have
* lexPush and lexPull. if the lexicon has not been opened before calls * been opened and closed */
* to these functions, their behavior can be unpredictable, most likely crashing struct cacheList{
*/ denseRIV* *cache;
LEXICON* lexOpen(const char* lexName, const char* flags); struct cacheList* next;
struct cacheList* prev;
/* lexClose should always be called after the last lex push or lex pull call }*rootCache = NULL;
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache /* IOstagingSlot is used by fLexPush to preformat data to be written in a single
*/ * fwrite() call. it has room for RIVSIZE integers behind it and 2*RIVSIZE
void lexClose(LEXICON*); * integers ahead of it, which the function saturationForStaging() will need */
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE;
/* both lexPush and lexPull must be called *after* the lexOpen() function /* lexOpen is called to "open the lexicon", setting up for later calls to
* and after using them the lexClose() function must be called to ensure * lexPush and lexPull. if the lexicon has not been opened before calls
* data security */ * to these functions, their behavior can be unpredictable, most likely crashing
* lexOpen accepts flags: r, w, x.
/* lexPush writes a denseRIV to the lexicon for permanent storage */ * r: for reading, currently meaningless, it wont stop you reading if you don't have this
int lexPush(LEXICON* lexicon, denseRIV* RIVout); * w: for writing. if a readonly lexicon is "written to" no data will be saved in hardcopy
* although it will be cached if possible, so that later pulls will be optimized
/* cacheCheckOnPush tests the state of this vector in our lexicon cache * x: exclusive. will not accept new words, lexPull returns a NULL pointer
* and returns 1 on "success" indicating cache storage and no need to push to file * and lexPush simply frees any word which is not already in the lexicon
* or returns 0 on "failure" indicating that the vector need be pushed to file */
*/ LEXICON* lexOpen(const char* lexName, const char* flags);
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
/* lexClose should always be called after the last lex push or lex pull call
/* lexPull reads a denseRIV from the lexicon, under "word" * if the lexicon is left open, some vector data may be lost due to
* if the file does not exist, it creates a 0 vector with the name of word * un-flushed RIV cache. also frees up data, memory leaks if lexicon is not closed
* lexPull returns a denseRIV *pointer* because its data must be tracked */
* globally for key optimizations void lexClose(LEXICON*);
*/
denseRIV* lexPull(LEXICON* lexicon, char* word); /* both lexPush and lexPull must be called *after* the lexOpen() function
* and after using them the lexClose() function must be called to ensure
/* cacheCheckonPull checks if the word's vector is stored in cache, * data security (only after the final push or pull, not regularly during operation */
* and returns a pointer to that vector on success
* or returns a NULL pointer if the word is not cached, indicating a need /* lexPush writes a denseRIV to the lexicon for permanent storage */
* to pull from file int lexPush(LEXICON* lexicon, denseRIV* RIVout);
*/
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word); /* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file, * lexPull returns a denseRIV *pointer* because its data must be tracked
* saving it for long-term aggregation. function is called by "lexPush", * globally for key optimizations
* which is what users should actually use. lexPush, unlike fLexPush, */
* has cache logic under the hood for speed and harddrive optimization denseRIV* lexPull(LEXICON* lexicon, char* word);
*/
int fLexPush(LEXICON* lexicon, denseRIV* RIVout); /* cacheCheckOnPush tests the state of this vector in our lexicon cache
* and returns 1 on "success" indicating cache storage and no need to push to file
/* flexPull pulls data directly from a file and converts it (if necessary) * or returns 0 on "failure" indicating that the vector need be pushed to file
* to a denseRIV. function is called by "lexPull" which is what users */
* should actually use. lexPull, unlike FlexPull, has cache logic under int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
* the hood for speed and harddrive optimization
*/ /* cacheCheckonPull checks if the word's vector is stored in cache,
denseRIV* fLexPull(FILE* lexWord); * and returns a pointer to that vector on success
* or returns a NULL pointer if the word is not cached, indicating a need
/* redefines signal behavior to protect cached data against seg-faults etc*/ * to pull from file
void signalSecure(int signum, siginfo_t *si, void* arg); */
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
/* used exclusively by flexpush to determine write-style (sparse or dense)
* and also formats the "IOstagingSlot" for fwrite as a single block if sparse /* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
*/ * saving it for long-term aggregation. function is called by "lexPush",
int saturationForStaging(denseRIV* output); * which is what users should actually use. lexPush, unlike fLexPush,
/* begin definitions */ * has cache logic under the hood for speed and harddrive optimization
LEXICON* lexOpen(const char* lexName, const char* flags){ */
LEXICON* output = calloc(1, sizeof(LEXICON)); int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
/* identify the presence of read, write, and exclusive flags */
char* r = strstr(flags, "r"); /* flexPull pulls data directly from a file and outputs it as a denseRIV.
char* w = strstr(flags, "w"); * function is called by "lexPull" which is what users
char* x = strstr(flags, "x"); * should actually use. lexPull, unlike FlexPull, has cache logic under
struct stat st = {0}; * the hood for speed and harddrive optimization
*/
denseRIV* fLexPull(FILE* lexWord);
if(w){
/* if set to write, we check and create if necessary, the lexicon */ /* redefines signal behavior to protect cached data against seg-faults etc*/
if (stat(lexName, &st) == -1) { void signalSecure(int signum, siginfo_t *si, void* arg);
mkdir(lexName, 0777); int cacheDump(denseRIV* *toDump);
}
output->flags |= WRITEFLAG; /* used exclusively by flexpush to determine write-style (sparse or dense)
}else if(r){ * and also formats the "IOstagingSlot" for fwrite as a single block if sparse
/* if set to read and not write, return null if lexicon does not exist */ */
if (stat(lexName, &st) == -1) { int saturationForStaging(denseRIV* output);
free(output); /* begin definitions */
return NULL; LEXICON* lexOpen(const char* lexName, const char* flags){
} LEXICON* output = calloc(1, sizeof(LEXICON));
output->flags |= READFLAG; /* identify the presence of read, write, and exclusive flags */
} char* r = strstr(flags, "r");
/* if not set to exclusive, set the inclusive flag */ char* w = strstr(flags, "w");
if(!x){ char* x = strstr(flags, "x");
output->flags |= INCFLAG; struct stat st = {0};
}
strcpy(output->lexName, lexName);
if(w){
/* if set to write, we check and create if necessary, the lexicon */
#if CACHESIZE > 0 if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
if(r && w){ }
//#TODO include hash vs sort cache logic flags /* flag for writing*/
/* if we will be reading and writing the same lexicon, setup a output->flags |= WRITEFLAG;
* cache for this lexicon to speed up rewrites */ }else if(r){
struct cacheList* newCache = calloc(1, sizeof(struct cacheList)); /* if set to read and not write, return null if lexicon does not exist */
#ifdef HASHCACHE if (stat(lexName, &st) == -1) {
newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*)); free(output);
#else return NULL;
#ifdef SORTCACHE }
newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*)); /* flag for reading */
output->treeRoot = calloc(1, sizeof(struct treenode)); output->flags |= READFLAG;
#endif }
#endif /* if not set to exclusive, set the inclusive flag */
output->flags |= CACHEFLAG; if(!x){
/* flag inclusive (will return unknown words as 0 vector */
output->cache = newCache->cache; output->flags |= INCFLAG;
newCache->next = rootCache; }
if(rootCache){ /* record the name of the lexicon */
rootCache->prev = newCache; strcpy(output->lexName, lexName);
}
rootCache = newCache; #if CACHESIZE > 0
output->listPoint = newCache; output->cache = calloc(CACHESIZE, sizeof(denseRIV*));
struct sigaction action = {0};
action.sa_sigaction = signalSecure; #ifdef SORTCACHE
action.sa_flags = SA_SIGINFO; /* a sorted cache needs a search tree for finding RIVs by name */
output->treeRoot = calloc(1, sizeof(struct treenode));
for(int i=1; i<27; i++){ output->cacheSaturation = 0;
sigaction(i,&action,NULL); output->cache_slider = output->cache+CACHESIZE;
} #endif /* SORTCACHE */
}
#endif /* flag cached ?? */
output->flags |= CACHEFLAG;
return output; if(w){
} /* setup cache-list element for break dumping */
void lexClose(LEXICON* toClose){ struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
newCache->cache = output->cache;
#if CACHESIZE>0
if(toClose->flags & CACHEFLAG){ newCache->next = rootCache;
if(cacheDump(toClose->cache)){ if(rootCache){
puts("cache dump failed, some lexicon data was lost"); rootCache->prev = newCache;
} }
struct cacheList* listPoint = toClose->listPoint; rootCache = newCache;
if(listPoint->prev){ output->listPoint = newCache;
listPoint->prev->next = toClose->listPoint->next;
} struct sigaction action = {0};
if(listPoint->next){ action.sa_sigaction = signalSecure;
listPoint->next->prev = toClose->listPoint->prev; action.sa_flags = SA_SIGINFO;
}
free(listPoint); for(int i=1; i<27; i++){
} sigaction(i,&action,NULL);
#endif }
free(toClose); }
}
#endif /* CACHESIZE > 0 */
return output;
#if CACHESIZE > 0 }
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){ void lexClose(LEXICON* toClose){
#ifdef HASHCACHE
srand(wordtoSeed(word)); #if CACHESIZE>0
int hash = rand()%CACHESIZE; if(toClose->flags & WRITEFLAG){
if(lexicon->cache[hash]){ if(cacheDump(toClose->cache)){
if(!strcmp(word, lexicon->cache[hash]->name)){ puts("cache dump failed, some lexicon data was lost");
/* if word is cached, pull from cache and exit */ }
return lexicon->cache[hash]; struct cacheList* listPoint = toClose->listPoint;
} if(listPoint->prev){
} listPoint->prev->next = toClose->listPoint->next;
return NULL; }
#endif if(listPoint->next){
#ifdef SORTCACHE listPoint->next->prev = toClose->listPoint->prev;
}
return treeSearch(lexicon->treeRoot, word); free(listPoint);
}
#endif #endif
} free(toClose);
}
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
/* if our RIV was cached already, no need to play with it */
if(RIVout->cached == lexicon){ #if CACHESIZE > 0
return 1; denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
} #ifdef HASHCACHE
#ifdef HASHCACHE /* we find which cache entry this word belongs in by simple hashing */
srand(wordtoSeed(RIVout->name)); srand(wordtoSeed(word));
int hash = rand()%CACHESIZE; int hash = rand()%CACHESIZE;
if(lexicon->cache[hash]){
/* if there is no word in this cache slot */ if(!strcmp(word, lexicon->cache[hash]->name)){
if(!lexicon->cache[hash]){ /* if word is cached, pull from cache and exit */
/* push to cache instead of file */ return lexicon->cache[hash];
lexicon->cache[hash] = RIVout; }
lexicon->cache[hash]->cached = lexicon; }
return 1; return NULL;
/*if the current RIV is more frequent than the RIV holding its slot */ #endif
} #ifdef SORTCACHE
if(RIVout->frequency > lexicon->cache[hash]->frequency ){ /* use a treeSearch (found in RIVaccessories) to find the denseRIV* in the cache */
/* push the lower frequency cache entry to a file */ return treeSearch(lexicon->treeRoot, word);
fLexPush(lexicon, lexicon->cache[hash]);
/* replace this cache-slot with the current vector */ #endif
}
lexicon->cache[hash] = RIVout;
lexicon->cache[hash]->cached = lexicon; int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
return 1; /* if our RIV was cached already, no need to play with it */
} if(RIVout->cached == lexicon){
return 0; /* return "success" the vector is already in cache and updated */
#endif /* HASHCACHE */ return 1;
#ifdef SORTCACHE }
denseRIV* *cache_slider = lexicon->cache; #ifdef HASHCACHE
while(*cache_slider){ srand(wordtoSeed(RIVout->name));
if(RIVout->frequency > (*cache_slider)->frequency){ int hash = rand()%CACHESIZE;
memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
if(lexicon->cache[CACHESIZE]){ /* if there is no word in this cache slot */
if(!lexicon->cache[hash]){
fLexPush(lexicon, lexicon->cache[CACHESIZE]); /* push to cache instead of file */
//remove tree element lexicon->cache[hash] = RIVout;
treecut(lexicon->treeRoot, RIVout->name); lexicon->cache[hash]->cached = lexicon;
lexicon->cache[CACHESIZE] = NULL; /* return "success" */
} return 1;
RIVout->cached = lexicon; /*if the current RIV is more frequent than the RIV holding its slot */
*cache_slider = RIVout; }
//add tree element if(RIVout->frequency > lexicon->cache[hash]->frequency ){
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout); /* push the lower frequency cache entry to a file */
fLexPush(lexicon, lexicon->cache[hash]);
return 1; /* replace this cache-slot with the current vector */
}
lexicon->cache[hash] = RIVout;
cache_slider++; lexicon->cache[hash]->cached = lexicon;
} /* return "success" */
if(cache_slider-lexicon->cache < CACHESIZE){ return 1;
RIVout->cached = lexicon; }
*cache_slider = RIVout; return 0;
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout); #endif /* HASHCACHE */
//add tree element #ifdef SORTCACHE
return 1;
} /* if the cache is not yet full, append this vector to the accumulating list */
return 0; if (lexicon->cacheSaturation < CACHESIZE){
#endif /* SORTCACHE */ RIVout->cached = lexicon;
} lexicon->cache[lexicon->cacheSaturation] = RIVout;
treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
#endif
denseRIV* lexPull(LEXICON* lexicon, char* word){
lexicon->cacheSaturation = lexicon->cacheSaturation+1;
denseRIV* output = NULL; /* return "success" */
return 1;
#if CACHESIZE > 0 }else{ /* if cache is full */
if(lexicon->flags & CACHEFLAG){
/* if there is a cache, first check if the word is cached */ RIVout->cached = lexicon;
if((output = cacheCheckOnPull(lexicon, word))){ denseRIV* toCheck = RIVout;
return output; denseRIV* temp;
}
} while(1){
#endif /* CACHESIZE > 0 */ if(lexicon->cache_slider == lexicon->cache){
lexicon->cache_slider += CACHESIZE;
/* if not, attempt to pull the word data from lexicon file */ }
char pathString[200]; (lexicon->cache_slider)--;
if(toCheck->frequency > (*lexicon->cache_slider)->frequency){
sprintf(pathString, "%s/%s", lexicon->lexName, word); temp = (*lexicon->cache_slider);
(*lexicon->cache_slider) = toCheck;
FILE *lexWord = fopen(pathString, "rb"); toCheck = temp;
}else{
/* if this lexicon file already exists */ if(toCheck == RIVout){
if(lexWord){ return 0;
/* pull data from file */ }else{
treecut(lexicon->treeRoot, toCheck->name);
output = fLexPull(lexWord); fLexPush(lexicon, toCheck);
strcpy(output->name, word); treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
fclose(lexWord); return 1;
}else{ }
/* if lexicon is set to inclusive (can gain new words) */ break;
if(lexicon->flags & INCFLAG){ }
}
/*if file does not exist, return a 0 vector (word is new to the lexicon) */ }
output = calloc(1, sizeof(denseRIV)); /* return "failure" */
strcpy(output->name, word); return 0;
}
/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */ #endif /* SORTCACHE */
} }
#endif
denseRIV* lexPull(LEXICON* lexicon, char* word){
return output;
} denseRIV* output = NULL;
int lexPush(LEXICON* lexicon, denseRIV* RIVout){ #if CACHESIZE > 0
if(lexicon->flags & CACHEFLAG){
#if CACHESIZE > 0 /* if there is a cache, first check if the word is cached */
if(lexicon->flags & CACHEFLAG){ if((output = cacheCheckOnPull(lexicon, word))){
/* check the cache to see if it belongs in cache */ return output;
if(cacheCheckOnPush(lexicon, RIVout)){ }
/* if the cache check returns 1, it has been dealth with in cache */ }
return 0; #endif /* CACHESIZE > 0 */
}
} /* if not, attempt to pull the word data from lexicon file */
char pathString[200];
#endif
sprintf(pathString, "%s/%s", lexicon->lexName, word);
/* push to the lexicon */ FILE *lexWord = fopen(pathString, "rb");
return fLexPush(lexicon, RIVout);
/* if this lexicon file already exists */
} if(lexWord){
/* pull data from file */
int saturationForStaging(denseRIV* output){
output = fLexPull(lexWord);
/* key/value pairs will be loaded to a worst-case sized temporary slot */ if(!output){
return NULL;
int* count = IOstagingSlot; }
*count = 0; /* record the "name" of the vector, as the word */
*(count+1) = 0; strcpy(output->name, word);
*(count+2) = output->frequency; fclose(lexWord);
*(count+3) = output->contextSize; }else{
*(float*)(count+4) = output->magnitude; /* if lexicon is set to inclusive (can gain new words) */
if(lexicon->flags & INCFLAG){
int* locations = IOstagingSlot+5;
int* values = IOstagingSlot-RIVSIZE;; /*if file does not exist, return a 0 vector (word is new to the lexicon) */
int* locations_slider = locations; output = calloc(1, sizeof(denseRIV));
int* values_slider = values; /* record the "name" of the vector, as the word */
for(int i=0; i<RIVSIZE; i++){ strcpy(output->name, word);
}else{
/* act only on non-zeros */ /*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
if(output->values[i]){ return NULL;
}
/* assign index to locations */
*(locations_slider++) = i; }
return output;
/* assign value to values */ }
*(values_slider++) = output->values[i];
int lexPush(LEXICON* lexicon, denseRIV* RIVout){
/* track size of forming sparseRIV */
*count += 1; #if CACHESIZE > 0
} if(lexicon->flags & CACHEFLAG){
} /* check the cache to see if it belongs in cache */
if(cacheCheckOnPush(lexicon, RIVout)){
/* copy values into slot immediately after locations */ /* if the cache check returns 1, it has been dealth with in cache */
memcpy(locations+*count, values, (*count)*sizeof(int)); return 0;
}
return *count; }
}
int fLexPush(LEXICON* lexicon, denseRIV* output){ #endif
char pathString[200] = {0};
if(lexicon->flags & WRITEFLAG){
/* word data will be placed in a (new?) file under the lexicon directory /* push to the lexicon */
* in a file named after the word itself */ return fLexPush(lexicon, RIVout);
sprintf(pathString, "%s/%s", lexicon->lexName, output->name); }else{
/* free and return */
int saturation = saturationForStaging(output); free(RIVout);
return 0;
if( saturation < RIVSIZE/2){ }
FILE *lexWord = fopen(pathString, "wb"); }
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name); int saturationForStaging(denseRIV* output){
return 1;
} /* IOstagingSlot is a reserved block of heap memory used for this (and other)
fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord); * purposes. in this function, all of the metadata to be written along with a
fclose(lexWord); * sparse representation of the vector, will be laid into the IOstagingSlot
}else{ * in the necessary format for writing and reading again */
output->cached = 0; int* count = IOstagingSlot;
FILE *lexWord = fopen(pathString, "wb"); /* count, requires an 8 byte slot for reasons of compatibility between
if(!lexWord){ * dense and sparse. it takes up two integers (int* count and count+1); */
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name); *count = 0;
return 1; *(count+1) = 0;
} *(count+2) = output->frequency;
fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord); *(count+3) = output->contextSize;
/* TODO fix this to allow magnitude to be changed to double easily */
fclose(lexWord); *(float*)(count+4) = output->magnitude;
}
/* locations will be laid in immediately after the metadata */
free(output); int* locations = IOstagingSlot+5;
/* values will be laid in *before* metadata, to be copied after locations,
return 0; * once the size of the values and locations arrays are known. there is,
} * by description of the stagingSlot, enough room for a
* completely saturated vector without conflict */
denseRIV* fLexPull(FILE* lexWord){ int* values = IOstagingSlot-RIVSIZE;;
denseRIV *output = calloc(1,sizeof(denseRIV)); int* locations_slider = locations;
size_t typeCheck; int* values_slider = values;
/* get metadata for vector */ for(int i=0; i<RIVSIZE; i++){
if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
return NULL; /* act only on non-zeros */
} if(output->values[i]){
int flag = 0;
/* first value stored is the value count if sparse, and 0 if dense */ /* assign index to locations */
if (typeCheck){ *(locations_slider++) = i;
/* pull as sparseVector */
/*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT)); /* assign value to values */
*(values_slider++) = output->values[i];
temp->count = typeCheck;
temp->locations = IOstagingSlot+5; /* track size of forming sparseRIV */
temp->values = temp->locations+temp->count; *count += 1;
}
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){ }
printf("vector read failure");
return NULL; /* copy values into slot immediately after locations */
}*/ memcpy(locations+*count, values, (*count)*sizeof(int));
sparseRIV temp; /* return number of non-zeros */
temp.count = typeCheck; return *count;
temp.locations = malloc(temp.count*2*sizeof(int)); }
temp.values = temp.locations+temp.count; int fLexPush(LEXICON* lexicon, denseRIV* output){
flag+= fread(&output->frequency, 1, sizeof(int), lexWord); char pathString[200] = {0};
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
flag+= fread(&output->magnitude, 1, sizeof(float), lexWord); /* word data will be placed in a (new?) file under the lexicon directory
flag += fread(temp.locations, temp.count, sizeof(int), lexWord); * in a file named after the word itself */
flag+= fread(temp.values, temp.count, sizeof(int), lexWord); sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
/* saturationForStaging returns the number of non-zero elements in the vector
addS2D(output->values, temp); * and, in the process, places the data of the vector, in sparse format, in the
}else{ * preallocated "IOstagingSlot" */
/* typecheck is thrown away, just a flag in this case */ int saturation = saturationForStaging(output);
flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord); /* if our vector is less than half full, it is lighter to save it as a sparseRIV */
flag +=fread(&output->magnitude, 1, sizeof(float), lexWord); if( saturation < RIVSIZE/2){
/*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure"); FILE *lexWord = fopen(pathString, "wb");
return NULL; if(!lexWord){
}*/ fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
} return 1;
}
/* IOstagingSlot is formatted for immediate writing */
output->cached = 0; fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
fclose(lexWord);
return output; }else{
/* the "cached" datapoint will be erased, a typecheck flag (0) for
} * the fLexPull function to know that this is a denseVector put
* in its place */
void signalSecure(int signum, siginfo_t *si, void* arg){ output->cached = 0;
while(rootCache){ FILE *lexWord = fopen(pathString, "wb");
if(cacheDump(rootCache->cache)){ if(!lexWord){
puts("cache dump failed, some lexicon data lost"); fprintf(stderr, "lexicon push has failed for word: %s\n", output->name);
} return 1;
rootCache = rootCache->next; }
/* from the type flag forward, all metadata is preformatted, we simpy write */
} fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
signal(signum, SIG_DFL);
kill(getpid(), signum); fclose(lexWord);
} }
int cacheDump(denseRIV* *toDump){
/* and free the memory */
int flag = 0; free(output);
denseRIV* *toDump_slider = toDump;
#ifdef HASHCACHE return 0;
denseRIV* *toDump_stop = toDump+CACHESIZE; }
while(toDump_slider<toDump_stop){
if(*toDump_slider){ denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV));
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider); size_t typeCheck;
} /* the first 8 byte value in the file will be either 0 (indicating storage as a dense vector)
toDump_slider++; * or a positive number, the number of values in a sparse-vector */
} if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
#else return NULL;
#ifdef SORTCACHE }
while(*toDump_slider){
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider); /* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){ /* pull as sparseVector */
toDump_slider++;
/*create a sparseVector pointer, pointing to a prealloccated slot */
} sparseRIV* temp = (sparseRIV*)RIVKey.h_tempBlock;
#endif /* typecheck, non-zero, is the number of values in our vector */
#endif temp->count = typeCheck;
free(toDump); /* locations slot comes immediately after the magnitude */
temp->locations = (int*)&(temp->magnitude) + 1;
return flag; /* and values slot comes immediately after locations */
} temp->values = temp->locations+temp->count;
#endif
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
printf("vector read failure");
return NULL;
}
/* add our temporary sparseVector to the empty denseVector, for output */
addS2D(output->values, *temp);
}else{ /* typecheck is thrown away, just a flag in this case */
/* read into our denseVector pre-formatted to fit */
if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure");
return NULL;
}
}
return output;
}
/* if our data is cached, it cannot be allowed to be lost in event of an issue */
void signalSecure(int signum, siginfo_t *si, void* arg){
/* descend linked list */
while(rootCache){
/* dumping all caches contained */
if(cacheDump(rootCache->cache)){
fprintf(stderr, "cache dump failed, some lexicon data lost");
}
rootCache = rootCache->next;
}
/* end with normal behavior of error */
signal(signum, SIG_DFL);
kill(getpid(), signum);
}
int cacheDump(denseRIV* *toDump){
/* flag will record if there are any errors and alert */
int flag = 0;
/* iterate through the elements of our cache */
denseRIV* *toDump_slider = toDump;
denseRIV* *toDump_stop = toDump+CACHESIZE;
while(toDump_slider<toDump_stop){
#ifdef HASHCACHE
/* if our cache is hashed, there may be null vectors to be skipped */
if(*toDump_slider){
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
}
#else /* HASHCAVHE */
#ifdef SORTCACHE
/* if our cache is sorted, a null vector represents the end of the cache */
if(!*toDump_slider)break;
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
#endif /* SORTCACHE */
#endif
toDump_slider++;
}
free(toDump);
return flag;
}
#endif /* RIV_LEXICON_H */
...@@ -6,10 +6,11 @@ ...@@ -6,10 +6,11 @@
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include <string.h> #include <string.h>
//#define HASHCACHE
#define RIVSIZE 50000 #define RIVSIZE 50000
#define NONZEROS 4 #define NONZEROS 4
#define CACHESIZE 27000 #define CACHESIZE 25000
#define SORTCACHE
#include "RIVtools.h" #include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context) //this program reads a directory full of files, and adds all context vectors (considering file as context)
...@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context); ...@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void lineGrind(char* textLine); void lineGrind(char* textLine);
LEXICON* lp; LEXICON* lp;
//int COUNTY = 0; int COUNTY = 0;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
lp = lexOpen("lexicon", "rw"); lp = lexOpen("lexiconshitty", "r");
//we open the lexicon, if it does not yet exist, it will be created //we open the lexicon, if it does not yet exist, it will be created
...@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){ ...@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
//ensure that the targeted root directory exists //ensure that the targeted root directory exists
struct stat st; struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist"); printf("directory doesn't seem to exist");
...@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){ ...@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
//open a file within root directory //open a file within root directory
FILE *input = fopen(pathString, "r"); FILE *input = fopen(pathString, "r");
if(input){ if(input){
if(COUNTY++>1000) return;
//process this file and add it's data to lexicon //process this file and add it's data to lexicon
//fprintf(stderr, "***%d", COUNTY++); //fprintf(stderr, "***%d", COUNTY++);
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
...@@ -133,7 +135,10 @@ void lineGrind(char* textLine){ ...@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon //we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(lp, word); lexiconRIV= lexPull(lp, word);
if(!lexiconRIV){
printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
continue;
}
//we add the context of this file to this wordVector //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector); addContext(lexiconRIV, contextVector);
...@@ -150,20 +155,13 @@ void lineGrind(char* textLine){ ...@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
} }
//free the heap allocated context vector data //free the heap allocated context vector data
free(contextVector.locations); free(contextVector.locations);
} }
void addContext(denseRIV* lexRIV, sparseRIV context){ void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison) //add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context); sparseRIV thing = context;
addS2D(lexRIV->values, thing);
//log the "size" of the vector which was added //log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises //this is not directly necessary, but is useful metadata for some analises
......
# clean: run ./RIVread once for each path passed as an argument.
# The loop walks "$@" directly, so an empty-string argument no longer ends
# the run early the way the old `while [ "$1" ]` test did.
clean(){
	for target in "$@"; do
		./RIVread "$target"
	done
}

# process every entry under the cleaned-books directory
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* Entry point: set up the RIV library, then report on every entry found
 * under the "lexicon/" directory. */
int main(){
	char lexiconRoot[] = "lexicon/";

	RIVInit();
	directoryToL2s(lexiconRoot);
	return 0;
}
/* directoryToL2s walks the directory named by rootString and, for every
 * non-hidden entry that is not itself a directory, pulls the entry with
 * lexPull, consolidates it with consolidateD2S and prints the entry's path
 * together with the resulting count ("saturation").
 *
 * rootString is concatenated directly with each entry name, so it must
 * already end in '/'.  Sub-directories are processed recursively.
 *
 * Nothing is returned; problems are reported on stdout.  If an entry cannot
 * be opened, the walk of the current directory stops.
 */
void directoryToL2s(char *rootString){

	sparseRIV fileRIV;
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;
	int length;

	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}

	while((files=readdir(directory))){

		/* skip ".", ".." and any other dot-prefixed entry */
		if(*(files->d_name) == '.') continue;

		if(files->d_type == DT_DIR){
			/* build "<root><name>/" without overrunning pathString */
			length = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			if(length < 0 || (size_t)length >= sizeof pathString){
				printf("path too long, skipping %s%s\n", rootString, files->d_name);
				continue;
			}
			directoryToL2s(pathString);
			/* a directory has been handled by the recursion; it must not
			 * also be opened and pulled as though it were a vector file */
			continue;
		}

		length = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}

		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}

		denseRIV temp = lexPull(pathString);
		fileRIV = consolidateD2S(temp.values);
		strcpy(fileRIV.name, pathString);
		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);
		fclose(input);
		free(temp.values);
		/* NOTE(review): fileRIV.locations is not released here (the original
		 * had the free commented out) - confirm who owns that buffer */
	}

	/* release the directory handle on every exit path of the loop */
	closedir(directory);
}
#include <stdio.h>
#include "RIVaccessories.h"
#include <time.h>
/* Interactive timing harness for the stem tree: builds the tree, then reads
 * whitespace-separated words from stdin, prints the stem found for each word
 * (or "no entry") and how long the lookup took.
 *
 * Returns 1 if the tree could not be built, 0 once stdin is exhausted.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	char word[100];
	char* stem;
	clock_t start, end;

	/* stemTreeSetup returns 0 when its word file cannot be opened */
	if(!root){
		return 1;
	}
	puts("tree ready");

	/* the field width keeps scanf inside word[]; the return check ends the
	 * loop on EOF or a read error instead of re-using the previous word forever */
	while(scanf("%99s", word) == 1){
		start = clock();
		stem = treeSearch(root, word);
		end = clock();
		if(stem){
			puts(stem);
		}else{
			puts("no entry");
		}
		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
	}
	return 0;
}
import pymongo
from pymongo import MongoClient
def dbSetup():
    """Connect to the ``rivwordnet`` database and return its ``stems`` collection.

    An index on the ``from`` field is requested on every call before the
    collection is handed back.
    """
    # NOTE(review): the connection string embeds a username and password in
    # source control - consider rotating them and loading the URI from
    # configuration instead.
    stems = MongoClient("mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet").rivwordnet.stems
    stems.create_index("from")
    return stems
def dbPost(wordset, collection):
    """Insert every ``word -> stem`` pair of ``wordset`` into ``collection``.

    wordset:    mapping whose keys become the ``from`` field and whose values
                become the ``to`` field of each inserted document.
    collection: object providing ``insert_many(list_of_documents)``.

    Nothing is inserted (and ``insert_many`` is not called) when ``wordset``
    is empty.  Returns None.
    """
    if not len(wordset):
        return
    # items() exists on both Python 2 and Python 3 dictionaries, whereas
    # iteritems() is Python 2 only
    posts = [{"from": key, "to": value} for key, value in wordset.items()]
    collection.insert_many(posts)
def cleanDbSetup():
    """Connect to the ``rivetcleandocs`` database and return its ``cleaned`` collection.

    An index on the ``file`` field is requested on every call before the
    collection is handed back.
    """
    # NOTE(review): the connection string embeds a username and password in
    # source control - consider rotating them and loading the URI from
    # configuration instead.
    cleaned = MongoClient("mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs").rivetcleandocs.cleaned
    cleaned.create_index("file")
    return cleaned
def dbPostCleaned(text, file, collection):
    """Store one cleaned document in ``collection``.

    text:       the cleaned text; when it has length 0 nothing is stored.
    file:       value saved under the ``file`` key of the document.
    collection: object providing ``insert_one(document)``.

    Returns None.
    """
    if len(text) == 0:
        return
    collection.insert_one({"text": text, "file": file})
def dbGet(words, collection):
    """Return the stem stored for ``words``, or 0 when there is none.

    words:      value matched against the ``from`` field of the stored
                documents (the field written by dbPost).
    collection: object providing ``find_one(query)``.

    The previous body referenced names that were never assigned and so
    raised NameError on every call; the lookup is now actually performed.
    """
    # NOTE(review): assumes a single word is looked up per call - confirm
    # against callers, since the parameter name is plural
    mebeword = collection.find_one({"from": words})
    if mebeword:
        return mebeword["to"]
    else:
        return 0
\ No newline at end of file
#include <stdio.h>
#include "../RIVaccessories.h"
int configInsert(struct treenode* node, char* letter, int treeSize);
int stemTreeConfig();
/* Print the tree size computed by stemTreeConfig, with no trailing newline,
 * so the value can be captured from stdout. */
int main(){
	printf("%d", stemTreeConfig());
	return 0;
}
/* configInsert threads the string starting at `letter` through the tree
 * rooted at `node`, creating any missing child along the way.
 *
 * Every node visited (the starting node included) has its `downstream`
 * counter incremented.  Each newly allocated child adds one to treeSize.
 * The child slot is chosen as (character - 'a').
 *
 * Returns the updated treeSize.
 */
int configInsert(struct treenode* node, char* letter, int treeSize){
	struct treenode* current = node;
	char* cursor = letter;

	for(;;){
		current->downstream++;

		/* end of the word: nothing further to create */
		if(!*cursor){
			return treeSize;
		}

		int slot = *cursor - 'a';
		if(!current->links[slot]){
			treeSize++;
			current->links[slot] = calloc(1, sizeof(struct treenode));
		}

		/* step down one level and on to the next character */
		current = current->links[slot];
		cursor++;
	}
}
/* stemTreeConfig counts how many tree nodes are needed to hold every word
 * listed in "wordset.txt".
 *
 * A throw-away tree is built with configInsert, starting from a size of 1
 * for the root.  As in the original, the matching entry of the global
 * `stemset` string is NUL-terminated in place for each word read, although
 * the stems themselves do not influence the count.
 *
 * Returns the node count, or 0 if the word file cannot be opened or the
 * root node cannot be allocated.
 */
int stemTreeConfig(){
	int treeSize = 1;
	FILE* wordFile = fopen("wordset.txt", "r");
	if(!wordFile){
		printf("no wordnet file");
		return 0;
	}
	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fclose(wordFile);
		return 0;
	}
	char word[100];
	char* stem = (char*)stemset;

	/* fscanf returns EOF (non-zero) at end of input, so compare against 1;
	 * the field width keeps long tokens inside word[] */
	while(fscanf(wordFile, "%99s", word) == 1){
		treeSize = configInsert(rootNode, word, treeSize);

		/* advance through stemset in step with the words, but never past
		 * its terminating NUL */
		if(*stem){
			int displacement = 0;
			sscanf(stem, "%*s%n", &displacement);
			if(stem[displacement]){
				stem[displacement] = '\0';
				stem += displacement+1;
			}else{
				stem += displacement;
			}
		}
	}
	fclose(wordFile);
	return treeSize;
}
#include <stdio.h>
#include "../RIVaccessories.h"
/* Report the node count produced by stemTreeConfig on stdout (no newline). */
int main(){
	int nodeCount;

	nodeCount = stemTreeConfig();
	printf("%d", nodeCount);
	return 0;
}
import dbtools
from subprocess import call
# Regenerates wordset.txt and stemset.h from the word -> stem pairs held in
# the database, then compiles and runs stemconf.c to learn how many tree
# nodes the word list needs and records that number in stemset.h.

collection = dbtools.dbSetup()

# collapse the stored documents into one word -> stem mapping
stemMap = {}
for doc in collection.find():
    stemMap[doc["from"]] = doc["to"]

# keys() and values() of an unmodified dict iterate in matching order, so
# words[i] corresponds to stems[i]
words = list(stemMap.keys())
stems = list(stemMap.values())

with open("wordset.txt", "w") as wordFile:
    wordFile.write(' '.join(words))

finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";' + '\nint treesize = '

# first pass: write a placeholder treesize of 0 so that stemconf.c has a
# stemset.h to compile against
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + '0;')

if call(["gcc", "stemconf.c", "-o", "stemconfig"]) != 0:
    raise SystemExit("failed to compile stemconf.c")

# the helper prints the node count on stdout; capture it through a file
with open("tempfile.txt", "w") as countFile:
    if call(["./stemconfig"], stdout=countFile) != 0:
        raise SystemExit("stemconfig exited with an error")

with open("tempfile.txt", "r") as countFile:
    treesize = countFile.read().strip()

if not treesize.isdigit():
    raise SystemExit("unexpected stemconfig output: " + treesize)

# second pass: rewrite the header with the real node count; the with-block
# guarantees the file is flushed and closed (the old `stemFile.close;`
# never actually called close)
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + treesize + ';')
This source diff could not be displayed because it is too large. You can view the blob instead.
279920
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#include <stdio.h>
#include "RIVtools.h"
/* Interactive tester for the stem tree.
 *
 * The program alternates between two modes, reading whitespace-separated
 * words from stdin:
 *   - lookup mode: each word is passed to treeSearch and the result printed
 *     ("NULL return" when nothing is found); a word starting with '1'
 *     switches to cut mode.
 *   - cut mode: each word is passed to treecut; a word starting with '0'
 *     switches back to lookup mode.
 * The switching tokens themselves are not searched for or cut.
 *
 * Returns 1 if the tree could not be built, 0 when stdin is exhausted.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	char word[100] = "";
	char* stem;

	/* stemTreeSetup returns 0 when its word file cannot be opened */
	if(!root){
		return 1;
	}

	for(;;){
		/* lookup mode */
		for(;;){
			/* bounded read; stop the program on EOF or read error */
			if(scanf("%99s", word) != 1){
				return 0;
			}
			if(*word == '1'){
				break;
			}
			stem = treeSearch(root, word);
			if(stem){
				puts(stem);
			}else{
				puts("NULL return");
			}
		}

		/* cut mode */
		for(;;){
			if(scanf("%99s", word) != 1){
				return 0;
			}
			if(*word == '0'){
				break;
			}
			treecut(root, word);
		}
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment