Commit fe20c6f5 by etcart

updated lots of stuff

parent 60856c1d
......@@ -4,6 +4,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "stemconfig/stemset.h"
struct treenode{
void* data;
......@@ -11,14 +12,14 @@ struct treenode{
struct treenode* links[26];
int downstream;
};
}*nextNode;
void stemInsert(struct treenode* node, char* letter, void* data);
int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data);
void treeInsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through
*/
......@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
return seed;
}
struct treenode* stemTreeSetup(){
FILE* netfile = fopen("stemnet2.txt", "r");
if(!netfile){
printf("no stemnet file");
FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
if(!wordFile){
printf("no wordnet file");
return 0;
}
struct treenode* rootNode = calloc(1, sizeof(struct treenode));
struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
nextNode = rootNode+1;
char word[100];
char stem[100];
char* stem = (char*)stemset;
int displacement;
while(fscanf(wordFile, "%s", word)){
while(fscanf(netfile, "%s %s", word, stem)){
sscanf(stem, "%*s%n", &displacement);
stem[displacement] = '\0';
if(feof(netfile)){
break;
}
stemInsert(rootNode, word, stem);
if(feof(wordFile)){
break;
}
stem += displacement+1;
}
fclose(wordFile);
return rootNode;
}
void* treeSearch(struct treenode* node, char* letter){
......@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
return node->data;
}
}
void RIVinsert(struct treenode* node, char* letter, void* data){
void stemInsert(struct treenode* node, char* letter, void* data){
node->downstream++;
if(*(letter)){
if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
node->links[*(letter)-'a'] = nextNode++;
}
RIVinsert(node->links[*(letter)-'a'], letter+1, data);
treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{
......@@ -119,43 +127,46 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
}
}
void stemInsert(struct treenode* node, char* letter, char* data){
void treeInsert(struct treenode* node, char* letter, void* data){
node->downstream++;
if(*(letter)){
if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
}
stemInsert(node->links[*(letter)-'a'], letter+1, data);
treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{
if(node->data) return;
node->data = calloc(strlen(data)+1, sizeof(char));
node->data = data;
strcpy((char*)node->data, data);
}
}
int treecut(struct treenode* node, char* letter){
node->downstream--;
int flag;
//continue searching downstream if there is a letter
if(*(letter)){
if(node->links[*(letter)-'a']){
//propagate to next section
flag = treecut(node->links[*(letter)-'a'], letter+1);
//if next section returned a "cut" flag, 0 it out
if(flag){
node->links[*(letter)-'a'] = NULL;
}
}
if(!node->downstream){
//there are no more letters, we've reached our destination
}else{
free(node);
return 1;
node->data = NULL;
}
}else{
//this is on a branch that leads nowhere, free it and return "cut" flag
if(!node->downstream){
free(node);
return 1;
......@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
}
/* destroyTree releases a search tree recursively: every reachable child
 * subtree first, then the node's data pointer, then the node itself.
 * node: root of the (sub)tree to release; NULL is accepted and ignored,
 *       so a failed setup that returned a NULL root can be passed safely.
 * NOTE(review): both node->data and every node are handed to free(). A tree
 * whose nodes come out of one pooled calloc() (as the nextNode pool in
 * stemTreeSetup appears to do) or whose data is not individually heap-owned
 * must not be destroyed with this function -- confirm against the callers.
 */
void destroyTree(struct treenode* node){
	if(!node){
		return;
	}
	for(int i=0; i<26; i++){
		/* recursion tolerates empty links, no per-link test needed */
		destroyTree(node->links[i]);
	}
	/* free(NULL) is a no-op, so data needs no guard */
	free(node->data);
	free(node);
}
#endif
No preview for this file type
File added
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 20000
#include "RIVtools.h"
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
#define k 5
typedef char label[200];
/* RIVclass gathers every training document that carries one label */
struct RIVclass{
label name; /* the label text; label is a char[200] typedef */
sparseRIV* set; /* heap array of document vectors, grown one element at a time with realloc in main */
int setSize; /* number of vectors currently stored in set */
};
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
LEXICON* lexicon;
int main(){
struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx");
lexicon = lexOpen("lexiconEnron50-4", "rx");
int classNo = 0;
......@@ -25,18 +30,38 @@ int main(){
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
if(!textSet){
puts("no file");
return 1;
}
struct RIVclass* class;
struct RIVclass* class = 0;
char text[20000];
label className;
while(fscanf(textSet, "%s\t%s", text, className)){
//int j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
//if(j++>100) break;
if(feof(textSet)) break;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
sparseRIV temp = line2L3(text, searchRoot);
temp.magnitude = getMagnitudeSparse(temp);
if(temp.magnitude == 0){
printf("%s, empty\n", text);
continue;
}
//printf("%s, %s", text, className);
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}
if(i == classCount){
/* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className);
......@@ -53,14 +78,10 @@ int main(){
classNo = classCount;
classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
}
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot);
sparseRIV thing= temp;
class->set[class->setSize] = thing;
class->setSize++;
......@@ -69,10 +90,71 @@ int main(){
for(int i=0; i<classCount; i++){
puts(classNames[i]);
puts(classes[i].name);
printf("%d\n\n", classes[i].setSize);
}
fclose(textSet);
textSet = fopen("../../Downloads/validationText.tsv", "r");
if(!textSet) return 1;
int won = 0;
int docTotal = 0;
//scanf("%d", &won);
//j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
if(feof(textSet)) break;
//if(j++>30) break;
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}if(i == classCount){
printf("unclassifiable\n");
continue;
}
sparseRIV thing= line2L3(text, searchRoot);
if(thing.count ==0){
continue;
}
docTotal++;
denseRIV inQuestion = {0};
addS2D(inQuestion.values, thing);
inQuestion.magnitude = getMagnitudeDense(&inQuestion);
double weights[classCount];
int choice = kNearest(weights, classes, classCount, inQuestion);
if(choice == -1){
printf("classificationFailed");
}else{
//puts(text);
printf("survey says! %s ", className);
printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
}
if(choice == classNo){
won++;
}
free(thing.locations);
}
printf("\n\n we got %d/%d ", won, docTotal);
for(int i=0; i<classCount; i++){
for(int j=0; j<classes[i].setSize; j++){
free(classes[i].set[j].locations);
}
free(classes[i].set);
}
free(classes);
free(classNames);
destroyTree(searchRoot);
lexClose(lexicon);
fclose(textSet);
return 0;
}
......@@ -132,24 +214,72 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
continue;
}else{
//printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values);
temp = normalize(*wordRIV, 10000);
//temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
//free(wordRIV);
lexPush(lexicon, wordRIV);
}
}
}
temp = consolidateD2S(accumulate.values);
return temp;
}
/* kNearest classifies inQuestion by a similarity-weighted vote among the
 * k training vectors with the highest cosine similarity to it.
 * weights:    caller-provided array of classCount doubles; on return holds
 *             each class's summed neighbour similarity, divided by the sum
 *             over all neighbours when that sum is positive
 * classes:    the labelled training sets to search
 * classCount: number of entries in classes (and in weights)
 * inQuestion: the vector to classify
 * returns the index of the winning class, or -1 if there were no training
 * vectors at all
 */
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
	double nearest[k];   /* similarity of each retained neighbour */
	int nearestLabel[k]; /* class index of each retained neighbour */
	int fill = 0;        /* how many neighbour slots are in use */

	memset(weights, 0, classCount*sizeof(double));

	for(int i=0; i<classCount; i++){
		for(int j=0; j<classes[i].setSize; j++){
			double cosine = cosCompare(inQuestion, classes[i].set[j]);

			/* the first k vectors seen are accepted unconditionally,
			 * and their class must be recorded along with the similarity */
			if(fill < k){
				nearest[fill] = cosine;
				nearestLabel[fill] = i;
				fill++;
				continue;
			}

			/* afterwards a candidate may only displace the weakest
			 * neighbour, otherwise a strong neighbour could be evicted
			 * while a weaker one survives */
			int weakest = 0;
			for(int x=1; x<k; x++){
				if(nearest[x] < nearest[weakest]){
					weakest = x;
				}
			}
			if(cosine > nearest[weakest]){
				nearest[weakest] = cosine;
				nearestLabel[weakest] = i;
			}
		}
	}

	/* every retained neighbour votes for its class with its similarity */
	double totalweight = 0;
	for(int j=0; j<fill; j++){
		weights[nearestLabel[j]] += nearest[j];
		totalweight += nearest[j];
	}

	/* only classes that actually own a neighbour may win; ties go to the
	 * lowest class index. the choice is made on the raw sums because
	 * dividing by a zero or negative total would corrupt the ordering */
	int choice = -1;
	for(int j=0; j<fill; j++){
		int candidate = nearestLabel[j];
		if(choice == -1
		|| weights[candidate] > weights[choice]
		|| (weights[candidate] == weights[choice] && candidate < choice)){
			choice = candidate;
		}
	}

	/* normalise for the caller only when the total is a usable divisor */
	if(totalweight > 0){
		for(int i=0; i<classCount; i++){
			weights[i] /= totalweight;
		}
	}
	return choice;
}
......
File added
......@@ -5,6 +5,7 @@
#include "RIVaccessories.h"
#include "assert.h"
/* these flags will be used by the lexicon to know its permissions and states */
#ifndef READFLAG
#define READFLAG 0x01
#endif
......@@ -21,60 +22,71 @@
#define CACHEFLAG 0x08
#endif
/* if the user has specified neither a hashed nor a sorted cache, we assume sorted.
* the hashed strategy is extremely light on CPU and memory, but very ineffective
* at ensuring that the most important vectors are cached; as such it is better
* suited to RAM disks and unusually fast SSDs. the sorted strategy
* is much more expensive for the CPU, but is far more effective at keeping
* hard-drive reads and writes to a minimum */
#ifndef SORTCACHE
#ifndef HASHCACHE
#define SORTCACHE
#endif
#endif
/* the LEXICON struct will be used similar to a FILE (as a pointer) which
* contains all metadata that a lexicon needs in order to be read and written to safely*/
/* LEXICON: handle describing one opened lexicon; created by lexOpen and
 * passed by pointer to lexPush, lexPull and lexClose */
typedef struct{
char lexName[100]; /* name given to lexOpen; fLexPush uses it as the directory part of each word's file path */
denseRIV* *cache; /* array of cached vector pointers, allocated in lexOpen when CACHESIZE > 0 */
struct cacheList* listPoint; /* NOTE(review): presumably this lexicon's element in the rootCache list -- its use is not visible here, confirm */
char flags; /* bitwise OR of READFLAG, WRITEFLAG, INCFLAG and CACHEFLAG, set in lexOpen */
#ifdef SORTCACHE
/* if our cache is sorted, we will need a search tree and a saturation */
struct treenode* treeRoot; /* search tree used to find a cached vector by its name (see cacheCheckOnPull) */
int cacheSaturation; /* number of cache slots filled so far; compared against CACHESIZE */
denseRIV* *cache_slider; /* cursor into cache; starts at cache+CACHESIZE and is stepped backwards, wrapping at the start */
#endif /* SORTCACHE */
}LEXICON;
/* this will form a linked list of caches, so that all data can be safely dumped
* in event of an error, no matter how many or how strangely lexica have
* been opened and closed */
/* one element per cache that must be dumped if the program is interrupted */
struct cacheList{
denseRIV* *cache; /* the cache array that cacheDump will be given */
struct cacheList* next; /* following element; signalSecure walks the list through this link */
struct cacheList* prev; /* preceding element; set when a newer element is linked in front */
}*rootCache = NULL; /* head of the list; NULL while no cache is registered */
#define IODISPLACEMENT (sizeof(((sparseRIV*)0)->count)\
+ sizeof(((sparseRIV*)0)->frequency)\
+ sizeof(((sparseRIV*)0)->contextSize)\
+ sizeof(((sparseRIV*)0)->magnitude))\
/ sizeof(int)
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
/* IOstagingSlot is used by fLexPush to preformat data to be written in a single
* fwrite() call. it has room for RIVSIZE integers behind it and 2*RIVSIZE
* integers ahead of it, which the function saturationForStaging() will need */
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE;
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
* lexOpen accepts flags: r, w, x.
* r: for reading, currently meaningless, it wont stop you reading if you don't have this
* w: for writing. if a readonly lexicon is "written to" no data will be saved in hardcopy
* although it will be cached if possible, so that later pulls will be optimized
* x: exclusive. will not accept new words, lexPull returns a NULL pointer
* and lexPush simply frees any word which is not already in the lexicon
*/
LEXICON* lexOpen(const char* lexName, const char* flags);
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
* un-flushed RIV cache. also frees up data, memory leaks if lexicon is not closed
*/
void lexClose(LEXICON*);
/* both lexPush and lexPull must be called *after* the lexOpen() function
* and after using them the lexClose() function must be called to ensure
* data security */
* data security (only after the final push or pull, not regularly during operation */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(LEXICON* lexicon, denseRIV* RIVout);
/* cacheCheckOnPush tests the state of this vector in our lexicon cache
* and returns 1 on "success" indicating cache storage and no need to push to file
* or returns 0 on "failure" indicating that the vector need be pushed to file
*/
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
......@@ -82,6 +94,12 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
*/
denseRIV* lexPull(LEXICON* lexicon, char* word);
/* cacheCheckOnPush tests the state of this vector in our lexicon cache
* and returns 1 on "success" indicating cache storage and no need to push to file
* or returns 0 on "failure" indicating that the vector need be pushed to file
*/
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
/* cacheCheckonPull checks if the word's vector is stored in cache,
* and returns a pointer to that vector on success
* or returns a NULL pointer if the word is not cached, indicating a need
......@@ -96,8 +114,8 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
*/
int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
/* flexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
/* flexPull pulls data directly from a file and outputs it as a denseRIV.
* function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
......@@ -105,6 +123,7 @@ denseRIV* fLexPull(FILE* lexWord);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
int cacheDump(denseRIV* *toDump);
/* used exclusively by flexpush to determine write-style (sparse or dense)
* and also formats the "IOstagingSlot" for fwrite as a single block if sparse
......@@ -125,6 +144,7 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
}
/* flag for writing*/
output->flags |= WRITEFLAG;
}else if(r){
/* if set to read and not write, return null if lexicon does not exist */
......@@ -132,33 +152,35 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
free(output);
return NULL;
}
/* flag for reading */
output->flags |= READFLAG;
}
/* if not set to exclusive, set the inclusive flag */
if(!x){
/* flag inclusive (will return unknown words as 0 vector */
output->flags |= INCFLAG;
}
/* record the name of the lexicon */
strcpy(output->lexName, lexName);
#if CACHESIZE > 0
output->cache = calloc(CACHESIZE, sizeof(denseRIV*));
if(r && w){
//#TODO include hash vs sort cache logic flags
/* if we will be reading and writing the same lexicon, setup a
* cache for this lexicon to speed up rewrites */
struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
#ifdef HASHCACHE
newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
#else
#ifdef SORTCACHE
newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*));
/* a sorted cache needs a search tree for finding RIVs by name */
output->treeRoot = calloc(1, sizeof(struct treenode));
#endif
#endif
output->cacheSaturation = 0;
output->cache_slider = output->cache+CACHESIZE;
#endif /* SORTCACHE */
/* flag cached ?? */
output->flags |= CACHEFLAG;
if(w){
/* setup cache-list element for break dumping */
struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
newCache->cache = output->cache;
output->cache = newCache->cache;
newCache->next = rootCache;
if(rootCache){
rootCache->prev = newCache;
......@@ -174,14 +196,15 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
sigaction(i,&action,NULL);
}
}
#endif
#endif /* CACHESIZE > 0 */
return output;
}
void lexClose(LEXICON* toClose){
#if CACHESIZE>0
if(toClose->flags & CACHEFLAG){
if(toClose->flags & WRITEFLAG){
if(cacheDump(toClose->cache)){
puts("cache dump failed, some lexicon data was lost");
}
......@@ -203,6 +226,7 @@ void lexClose(LEXICON* toClose){
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
#ifdef HASHCACHE
/* we find which cache entry this word belongs in by simple hashing */
srand(wordtoSeed(word));
int hash = rand()%CACHESIZE;
if(lexicon->cache[hash]){
......@@ -214,7 +238,7 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
return NULL;
#endif
#ifdef SORTCACHE
/* use a treeSearch (found in RIVaccessories) to find the denseRIV* in the cache */
return treeSearch(lexicon->treeRoot, word);
#endif
......@@ -224,6 +248,7 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
/* if our RIV was cached already, no need to play with it */
if(RIVout->cached == lexicon){
/* return "success" the vector is already in cache and updated */
return 1;
}
#ifdef HASHCACHE
......@@ -235,6 +260,7 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
/* push to cache instead of file */
lexicon->cache[hash] = RIVout;
lexicon->cache[hash]->cached = lexicon;
/* return "success" */
return 1;
/*if the current RIV is more frequent than the RIV holding its slot */
}
......@@ -245,41 +271,54 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
lexicon->cache[hash] = RIVout;
lexicon->cache[hash]->cached = lexicon;
/* return "success" */
return 1;
}
return 0;
#endif /* HASHCACHE */
#ifdef SORTCACHE
denseRIV* *cache_slider = lexicon->cache;
while(*cache_slider){
if(RIVout->frequency > (*cache_slider)->frequency){
memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
if(lexicon->cache[CACHESIZE]){
fLexPush(lexicon, lexicon->cache[CACHESIZE]);
//remove tree element
treecut(lexicon->treeRoot, RIVout->name);
lexicon->cache[CACHESIZE] = NULL;
}
/* if the cache is not yet full, append this vector to the accumulating list */
if (lexicon->cacheSaturation < CACHESIZE){
RIVout->cached = lexicon;
*cache_slider = RIVout;
//add tree element
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
lexicon->cache[lexicon->cacheSaturation] = RIVout;
treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
lexicon->cacheSaturation = lexicon->cacheSaturation+1;
/* return "success" */
return 1;
}
}else{ /* if cache is full */
cache_slider++;
}
if(cache_slider-lexicon->cache < CACHESIZE){
RIVout->cached = lexicon;
*cache_slider = RIVout;
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
//add tree element
denseRIV* toCheck = RIVout;
denseRIV* temp;
while(1){
if(lexicon->cache_slider == lexicon->cache){
lexicon->cache_slider += CACHESIZE;
}
(lexicon->cache_slider)--;
if(toCheck->frequency > (*lexicon->cache_slider)->frequency){
temp = (*lexicon->cache_slider);
(*lexicon->cache_slider) = toCheck;
toCheck = temp;
}else{
if(toCheck == RIVout){
return 0;
}else{
treecut(lexicon->treeRoot, toCheck->name);
fLexPush(lexicon, toCheck);
treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
return 1;
}
break;
}
}
}
/* return "failure" */
return 0;
#endif /* SORTCACHE */
}
......@@ -309,6 +348,10 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
/* pull data from file */
output = fLexPull(lexWord);
if(!output){
return NULL;
}
/* record the "name" of the vector, as the word */
strcpy(output->name, word);
fclose(lexWord);
}else{
......@@ -317,13 +360,14 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
/*if file does not exist, return a 0 vector (word is new to the lexicon) */
output = calloc(1, sizeof(denseRIV));
/* record the "name" of the vector, as the word */
strcpy(output->name, word);
}
}else{
/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
return NULL;
}
}
return output;
}
......@@ -340,24 +384,39 @@ int lexPush(LEXICON* lexicon, denseRIV* RIVout){
#endif
if(lexicon->flags & WRITEFLAG){
/* push to the lexicon */
return fLexPush(lexicon, RIVout);
}else{
/* free and return */
free(RIVout);
return 0;
}
}
int saturationForStaging(denseRIV* output){
/* key/value pairs will be loaded to a worst-case sized temporary slot */
/* IOstagingSlot is a reserved block of heap memory used for this (and other)
* purposes. in this function, all of the metadata to be written along with a
* sparse representation of the vector, will be laid into the IOstagingSlot
* in the necessary format for writing and reading again */
int* count = IOstagingSlot;
/* count, requires an 8 byte slot for reasons of compatibility between
* dense and sparse. it takes up two integers (int* count and count+1); */
*count = 0;
*(count+1) = 0;
*(count+2) = output->frequency;
*(count+3) = output->contextSize;
/* TODO fix this to allow magnitude to be changed to double easily */
*(float*)(count+4) = output->magnitude;
/* locations will be laid in immediately after the metadata */
int* locations = IOstagingSlot+5;
/* values will be laid in *before* metadata, to be copied after locations,
* once the size of the values and locations arrays are known. there is,
* by description of the stagingSlot, enough room for a
* completely saturated vector without conflict */
int* values = IOstagingSlot-RIVSIZE;;
int* locations_slider = locations;
int* values_slider = values;
......@@ -380,6 +439,7 @@ int saturationForStaging(denseRIV* output){
/* copy values into slot immediately after locations */
memcpy(locations+*count, values, (*count)*sizeof(int));
/* return number of non-zeros */
return *count;
}
int fLexPush(LEXICON* lexicon, denseRIV* output){
......@@ -389,29 +449,39 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
* in a file named after the word itself */
sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
/* saturationForStaging returns the number of non-zero elements in the vector
* and, in the process, places the data of the vector, in sparse format, in the
* preallocated "IOstagingSlot" */
int saturation = saturationForStaging(output);
/* if our vector is less than half full, it is lighter to save it as a sparseRIV */
if( saturation < RIVSIZE/2){
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
return 1;
}
/* IOstagingSlot is formatted for immediate writing */
fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
fclose(lexWord);
}else{
/* the "cached" datapoint will be erased, a typecheck flag (0) for
* the fLexPull function to know that this is a denseVector put
* in its place */
output->cached = 0;
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
fprintf(stderr, "lexicon push has failed for word: %s\n", output->name);
return 1;
}
/* from the type flag forward, all metadata is preformatted, we simpy write */
fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
fclose(lexWord);
}
/* and free the memory */
free(output);
return 0;
......@@ -420,91 +490,85 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV));
size_t typeCheck;
/* get metadata for vector */
/* the first 8 byte value in the file will be either 0 (indicating storage as a dense vector)
* or a positive number, the number of values in a sparse-vector */
if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
return NULL;
}
int flag = 0;
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
/*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
if (typeCheck){ /* pull as sparseVector */
/*create a sparseVector pointer, pointing to a prealloccated slot */
sparseRIV* temp = (sparseRIV*)RIVKey.h_tempBlock;
/* typecheck, non-zero, is the number of values in our vector */
temp->count = typeCheck;
temp->locations = IOstagingSlot+5;
/* locations slot comes immediately after the magnitude */
temp->locations = (int*)&(temp->magnitude) + 1;
/* and values slot comes immediately after locations */
temp->values = temp->locations+temp->count;
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
printf("vector read failure");
return NULL;
}*/
sparseRIV temp;
temp.count = typeCheck;
temp.locations = malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
flag+= fread(&output->magnitude, 1, sizeof(float), lexWord);
flag += fread(temp.locations, temp.count, sizeof(int), lexWord);
flag+= fread(temp.values, temp.count, sizeof(int), lexWord);
}
/* add our temporary sparseVector to the empty denseVector, for output */
addS2D(output->values, *temp);
}else{ /* typecheck is thrown away, just a flag in this case */
addS2D(output->values, temp);
}else{
/* typecheck is thrown away, just a flag in this case */
flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
flag +=fread(&output->magnitude, 1, sizeof(float), lexWord);
/*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
/* read into our denseVector pre-formatted to fit */
if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure");
return NULL;
}*/
}
output->cached = 0;
}
return output;
}
/* if our data is cached, it cannot be allowed to be lost in event of an issue
 * signalSecure has the three-argument signal handler shape used with
 * sigaction; si and arg are not used.
 * it dumps every cache registered in the rootCache list, then restores the
 * default action for signum and re-raises it against this process.
 * NOTE(review): puts/fprintf, and the free() performed inside cacheDump, are
 * not on the POSIX async-signal-safe list; this is a best-effort rescue.
 * NOTE(review): both the puts and the fprintf report below are present in
 * this text; one of them is probably a leftover -- confirm which is intended.
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
/* descend linked list; rootCache itself is advanced, so the list is consumed */
while(rootCache){
/* dumping all caches contained; a non-zero result from cacheDump is reported as a failure */
if(cacheDump(rootCache->cache)){
puts("cache dump failed, some lexicon data lost");
fprintf(stderr, "cache dump failed, some lexicon data lost");
}
rootCache = rootCache->next;
}
/* end with normal behavior of error: reset to the default action and re-send the signal to ourselves */
signal(signum, SIG_DFL);
kill(getpid(), signum);
}
int cacheDump(denseRIV* *toDump){
/* flag will record if there are any errors and alert */
int flag = 0;
/* iterate through the elements of our cache */
denseRIV* *toDump_slider = toDump;
#ifdef HASHCACHE
denseRIV* *toDump_stop = toDump+CACHESIZE;
while(toDump_slider<toDump_stop){
#ifdef HASHCACHE
/* if our cache is hashed, there may be null vectors to be skipped */
if(*toDump_slider){
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
}
toDump_slider++;
}
#else
#else /* HASHCAVHE */
#ifdef SORTCACHE
while(*toDump_slider){
/* if our cache is sorted, a null vector represents the end of the cache */
if(!*toDump_slider)break;
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
#endif /* SORTCACHE */
#endif
toDump_slider++;
}
#endif
#endif
free(toDump);
return flag;
}
#endif
#endif /* RIV_LEXICON_H */
......@@ -6,10 +6,11 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
//#define HASHCACHE
#define RIVSIZE 50000
#define NONZEROS 4
#define CACHESIZE 27000
#define CACHESIZE 25000
#define SORTCACHE
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
//int COUNTY = 0;
int COUNTY = 0;
int main(int argc, char *argv[]){
char pathString[1000];
lp = lexOpen("lexicon", "rw");
lp = lexOpen("lexiconshitty", "r");
//we open the lexicon, if it does not yet exist, it will be created
......@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
strcpy(pathString, argv[1]);
strcat(pathString, "/");
//ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
......@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
if(COUNTY++>1000) return;
//process this file and add it's data to lexicon
//fprintf(stderr, "***%d", COUNTY++);
fileGrind(input);
fclose(input);
......@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(lp, word);
if(!lexiconRIV){
printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
continue;
}
//we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
......@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
}
//free the heap allocated context vector data
free(contextVector.locations);
}
void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
sparseRIV thing = context;
addS2D(lexRIV->values, thing);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises
......
# clean: run ./RIVread once for each argument, in order, stopping at the
# first argument that is missing or empty
clean(){
	until [ -z "$1" ]; do
		./RIVread "$1"
		shift
	done
}

# process every file found in the cleaned-books directory
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* entry point: initialise the RIV tools, then report the saturation of
 * every vector stored under the "lexicon/" directory */
int main(){
	char rootString[] = "lexicon/";

	RIVInit();
	directoryToL2s(rootString);
	return 0;
}
/* directoryToL2s walks rootString recursively and, for every regular entry,
 * pulls its vector with lexPull and prints the entry's path together with
 * the number of non-zero elements ("saturation").
 * rootString: directory path, expected to end in '/', since entry names are
 *             appended to it directly
 * entries whose name begins with '.' are skipped. the walk of a directory
 * stops at the first entry that cannot be opened.
 */
void directoryToL2s(char *rootString){
	sparseRIV fileRIV;
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;

	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}

	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;

		int isDirectory = (files->d_type == DT_DIR);

		/* build the path with a bounded write; a directory gets the
		 * trailing '/' that the recursive call relies on */
		int length = snprintf(pathString, sizeof pathString, "%s%s%s",
		                      rootString, files->d_name, isDirectory ? "/" : "");
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}

		if(isDirectory){
			/* a directory is only descended into; it holds no vector
			 * of its own and must not be opened as a file */
			directoryToL2s(pathString);
			continue;
		}

		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}

		denseRIV temp = lexPull(pathString);
		fileRIV = consolidateD2S(temp.values);
		strcpy(fileRIV.name, pathString);
		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);
		fclose(input);
		free(temp.values);
		/* NOTE(review): fileRIV.locations is not released here; the
		 * original had that free commented out -- confirm ownership */
	}

	/* release the directory handle on every path that opened it */
	closedir(directory);
}
#include <stdio.h>
#include "RIVaccessories.h"
#include <time.h>
/* interactive timing harness: builds the stem tree, then for every word
 * read from standard input prints its stem (or "no entry") and the CPU time
 * the lookup took. ends when input runs out.
 * returns 0 at end of input, 1 if the stem tree could not be built.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		/* stemTreeSetup reports a missing word file and returns 0 */
		return 1;
	}
	char word[100];
	char* stem;
	clock_t start, end;
	puts("tree ready");

	/* the field width keeps scanf inside word[]; testing the result stops
	 * the loop at end of input instead of re-searching a stale word forever */
	while(scanf("%99s", word) == 1){
		start = clock();
		stem = treeSearch(root, word);
		end = clock();
		if(stem){
			puts(stem);
		}else{
			puts("no entry");
		}
		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
	}
	return 0;
}
import pymongo
from pymongo import MongoClient
def dbSetup():
    """Connect to the stem database and return its "stems" collection.

    An index on the "from" field is requested before the collection is
    returned.

    The connection string is taken from the RIVWORDNET_MONGO_URI environment
    variable when it is set, and falls back to the built-in one otherwise.
    """
    import os

    # SECURITY: the fallback embeds a username and password in source control.
    # It is kept only so existing setups keep working; rotate the credential
    # and supply the URI through the environment instead.
    uri = os.environ.get(
        "RIVWORDNET_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet",
    )
    client = MongoClient(uri)
    database = client.rivwordnet
    collection = database.stems
    collection.create_index("from")
    return collection
def dbPost(wordset, collection):
    """Store a word -> stem mapping in the collection.

    wordset:    dict mapping each word to its stem; an empty dict or None
                results in no database call
    collection: object providing insert_many()

    Each pair is stored as a {"from": word, "to": stem} document, all in a
    single insert_many call.
    """
    if not wordset:
        return
    # items() exists on both Python 2 and Python 3 dicts, unlike iteritems()
    posts = [{"from": key, "to": value} for key, value in wordset.items()]
    collection.insert_many(posts)
def cleanDbSetup():
    """Connect to the cleaned-documents database and return its "cleaned" collection.

    An index on the "file" field is requested before the collection is
    returned.

    The connection string is taken from the RIVETCLEANDOCS_MONGO_URI
    environment variable when it is set, and falls back to the built-in one
    otherwise.
    """
    import os

    # SECURITY: the fallback embeds a username and password in source control.
    # It is kept only so existing setups keep working; rotate the credential
    # and supply the URI through the environment instead.
    uri = os.environ.get(
        "RIVETCLEANDOCS_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs",
    )
    client = MongoClient(uri)
    database = client.rivetcleandocs
    collection = database.cleaned
    collection.create_index("file")
    return collection
def dbPostCleaned(text, file, collection):
    """Store one cleaned document in the collection.

    text:       the cleaned text; empty text or None results in no database call
    file:       identifier stored under the "file" key
    collection: object providing insert_one()
    """
    # truthiness covers None as well as the empty string, where len() would raise
    if not text:
        return
    document = {
        "text": text,
        "file": file,
    }
    collection.insert_one(document)
def dbGet(words, collection):
    """Return the stored stem for a word, or 0 when there is none.

    words:      the value to look up in the "from" field
    collection: object providing find_one()

    NOTE(review): the previous body tested and indexed names that were never
    assigned, so every call raised NameError. The lookup on "from" is
    reconstructed from the {"from": ..., "to": ...} documents that dbPost
    stores -- confirm this is the intended query.
    """
    mebeword = collection.find_one({"from": words})
    if mebeword:
        return mebeword["to"]
    else:
        return 0
\ No newline at end of file
#include <stdio.h>
#include "../RIVaccessories.h"
int configInsert(struct treenode* node, char* letter, int treeSize);
int stemTreeConfig();
/* entry point: print the node count reported by stemTreeConfig as a bare
 * decimal number with no trailing newline */
int main(){
	printf("%d", stemTreeConfig());
	return 0;
}
/* configInsert threads one word through the counting tree, creating any
 * missing node along the way, and keeps a running total of nodes.
 * node:     node to start from (the root for a whole word)
 * letter:   remaining characters of the word, NUL-terminated
 * treeSize: node total before this word
 * returns the node total after this word.
 * downstream is incremented on every node visited, as before.
 * insertion stops at the first character outside 'a'..'z', because such a
 * character has no slot in links[]; nodes created for the valid prefix stay
 * counted.
 * exits the program if a node cannot be allocated, since an undercounted
 * total would be worse than no total.
 */
int configInsert(struct treenode* node, char* letter, int treeSize){
	for(;;){
		node->downstream++;
		if(!*letter){
			return treeSize;
		}

		int slot = *letter - 'a';
		if(slot < 0 || slot >= 26){
			return treeSize;
		}

		if(!node->links[slot]){
			node->links[slot] = calloc(1, sizeof(struct treenode));
			if(!node->links[slot]){
				fprintf(stderr, "out of memory while sizing the stem tree\n");
				exit(EXIT_FAILURE);
			}
			treeSize++;
		}
		node = node->links[slot];
		letter++;
	}
}
/* stemTreeConfig counts how many tree nodes are needed to hold every word
 * listed in "wordset.txt" (whitespace separated), the root included.
 * returns that count, or 0 if the file cannot be opened or the root cannot
 * be allocated.
 * diagnostics go to stderr: main prints the returned count on stdout and
 * that output is captured as the tree size, so stdout must carry only the
 * number.
 * the counting tree is not freed here; main exits right after printing.
 */
int stemTreeConfig(){
	int treeSize = 1;
	FILE* wordFile = fopen("wordset.txt", "r");
	if(!wordFile){
		fprintf(stderr, "no wordnet file");
		return 0;
	}
	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fprintf(stderr, "out of memory while sizing the stem tree\n");
		fclose(wordFile);
		return 0;
	}

	char word[100];
	/* the width keeps fscanf inside word[]; comparing against 1 ends the
	 * loop at end of file without re-inserting the last word. the stems are
	 * not needed for counting, so stemset is left untouched */
	while(fscanf(wordFile, "%99s", word) == 1){
		treeSize = configInsert(rootNode, word, treeSize);
	}
	fclose(wordFile);
	return treeSize;
}
#include <stdio.h>
#include "../RIVaccessories.h"
/* entry point: write the node count from stemTreeConfig to standard output
 * as a bare decimal number, without a newline */
int main(){
	int nodeCount = stemTreeConfig();

	printf("%d", nodeCount);
	return 0;
}
import dbtools
from subprocess import call
# Regenerates the stem configuration from the database:
#   wordset.txt - every known word, space separated
#   stemset.h   - the matching stems in the same order, plus the node count
#                 of the search tree, which stemconf.c computes
collection = dbtools.dbSetup()

# word -> stem; a word stored more than once keeps the last stem seen
stemMap = {}
for doc in collection.find():
    stemMap[doc["from"]] = doc["to"]

# a single pass over the dict keeps words[i] paired with stems[i];
# items() works on both Python 2 and Python 3
words = []
stems = []
for key, value in stemMap.items():
    words.append(key)
    stems.append(value)

with open("wordset.txt", "w") as wordFile:
    wordFile.write(' '.join(words))

finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";' + '\nint treesize = '

# first pass: write a placeholder size so that stemconf.c can be compiled
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + '0;')

# build and run the sizing program, capturing the count it prints
if call(["gcc", "stemconf.c", "-o", "stemconfig"]) != 0:
    raise SystemExit("could not compile stemconf.c")
with open("tempfile.txt", "w") as tempfile:
    if call(["./stemconfig"], stdout=tempfile) != 0:
        raise SystemExit("stemconfig failed")

with open("tempfile.txt", "r") as tempfile:
    treesize = tempfile.read().strip()

# second pass: rewrite the header with the real size; the with-block
# guarantees the file is flushed and closed
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + treesize + ';')
This source diff could not be displayed because it is too large. You can view the blob instead.
279920
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#include <stdio.h>
#include "RIVtools.h"
/* interactive harness for the stem tree, alternating between two modes:
 *   search mode - each word read is looked up and its stem (or
 *                 "NULL return") printed; a word starting with '1'
 *                 switches to cut mode
 *   cut mode    - each word read is removed with treecut; a word starting
 *                 with '0' switches back to search mode
 * the switching words are commands only and are never searched or cut:
 * treecut indexes links[] with (letter - 'a'), which is negative for
 * digits. NOTE(review): treeSearch presumably indexes the same way.
 * returns 0 when input ends, 1 if the stem tree could not be built.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		return 1;
	}
	char word[100] = "";
	char* stem;

	for(;;){
		/* search mode */
		for(;;){
			if(scanf("%99s", word) != 1){
				return 0;
			}
			if(*word == '1'){
				break;
			}
			stem = treeSearch(root, word);
			if(stem){
				puts(stem);
			}else{
				puts("NULL return");
			}
		}

		/* cut mode */
		for(;;){
			if(scanf("%99s", word) != 1){
				return 0;
			}
			if(*word == '0'){
				break;
			}
			treecut(root, word);
		}
	}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment