Commit fe20c6f5 by etcart

updated lots of stuff

parent 60856c1d
......@@ -4,6 +4,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "stemconfig/stemset.h"
struct treenode{
void* data;
......@@ -11,14 +12,14 @@ struct treenode{
struct treenode* links[26];
int downstream;
};
}*nextNode;
void stemInsert(struct treenode* node, char* letter, void* data);
int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data);
void treeInsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through
*/
......@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
return seed;
}
struct treenode* stemTreeSetup(){
FILE* netfile = fopen("stemnet2.txt", "r");
if(!netfile){
printf("no stemnet file");
FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
if(!wordFile){
printf("no wordnet file");
return 0;
}
struct treenode* rootNode = calloc(1, sizeof(struct treenode));
struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
nextNode = rootNode+1;
char word[100];
char stem[100];
char* stem = (char*)stemset;
int displacement;
while(fscanf(wordFile, "%s", word)){
while(fscanf(netfile, "%s %s", word, stem)){
sscanf(stem, "%*s%n", &displacement);
stem[displacement] = '\0';
if(feof(netfile)){
break;
}
stemInsert(rootNode, word, stem);
if(feof(wordFile)){
break;
}
stem += displacement+1;
}
fclose(wordFile);
return rootNode;
}
void* treeSearch(struct treenode* node, char* letter){
......@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
return node->data;
}
}
void RIVinsert(struct treenode* node, char* letter, void* data){
void stemInsert(struct treenode* node, char* letter, void* data){
node->downstream++;
if(*(letter)){
if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
node->links[*(letter)-'a'] = nextNode++;
}
RIVinsert(node->links[*(letter)-'a'], letter+1, data);
treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{
......@@ -119,43 +127,46 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
}
}
void stemInsert(struct treenode* node, char* letter, char* data){
void treeInsert(struct treenode* node, char* letter, void* data){
node->downstream++;
if(*(letter)){
if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
}
stemInsert(node->links[*(letter)-'a'], letter+1, data);
treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{
if(node->data) return;
node->data = calloc(strlen(data)+1, sizeof(char));
node->data = data;
strcpy((char*)node->data, data);
}
}
int treecut(struct treenode* node, char* letter){
node->downstream--;
int flag;
//continue searching downstream if there is a letter
if(*(letter)){
if(node->links[*(letter)-'a']){
//propagate to next section
flag = treecut(node->links[*(letter)-'a'], letter+1);
//if next section returned a "cut" flag, 0 it out
if(flag){
node->links[*(letter)-'a'] = NULL;
}
}
if(!node->downstream){
//there are no more letters, we've reached our destination
}else{
free(node);
return 1;
node->data = NULL;
}
}else{
//this is on a branch that leads nowhere, free it and return "cut" flag
if(!node->downstream){
free(node);
return 1;
......@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
}
/* destroyTree releases a whole subtree: the node's data pointer, then each
 * of its 26 child subtrees, then the node itself.
 * node is dereferenced without a check, so it must not be NULL.
 * NOTE(review): every node and every non-NULL data pointer is passed to
 * free(), so both are assumed to be individually heap-allocated - confirm
 * against whichever insert routine built the tree.
 */
void destroyTree(struct treenode* node){
	/* free(NULL) is a no-op, so the data pointer needs no guard */
	free(node->data);

	struct treenode** child = node->links;
	struct treenode** pastLast = child + 26;
	while(child != pastLast){
		if(*child){
			destroyTree(*child);
		}
		child++;
	}
	free(node);
}
#endif
No preview for this file type
File added
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 20000
#include "RIVtools.h"
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
#define k 5
typedef char label[200];
struct RIVclass{
label name;
sparseRIV* set;
int setSize;
};
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
LEXICON* lexicon;
int main(){
struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx");
lexicon = lexOpen("lexiconEnron50-4", "rx");
int classNo = 0;
......@@ -25,18 +30,38 @@ int main(){
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
if(!textSet){
puts("no file");
return 1;
}
struct RIVclass* class;
struct RIVclass* class = 0;
char text[20000];
label className;
while(fscanf(textSet, "%s\t%s", text, className)){
//int j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
//if(j++>100) break;
if(feof(textSet)) break;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
sparseRIV temp = line2L3(text, searchRoot);
temp.magnitude = getMagnitudeSparse(temp);
if(temp.magnitude == 0){
printf("%s, empty\n", text);
continue;
}
//printf("%s, %s", text, className);
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}
if(i == classCount){
/* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className);
......@@ -53,14 +78,10 @@ int main(){
classNo = classCount;
classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
}
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot);
sparseRIV thing= temp;
class->set[class->setSize] = thing;
class->setSize++;
......@@ -69,10 +90,71 @@ int main(){
for(int i=0; i<classCount; i++){
puts(classNames[i]);
puts(classes[i].name);
printf("%d\n\n", classes[i].setSize);
}
fclose(textSet);
textSet = fopen("../../Downloads/validationText.tsv", "r");
if(!textSet) return 1;
int won = 0;
int docTotal = 0;
//scanf("%d", &won);
//j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
if(feof(textSet)) break;
//if(j++>30) break;
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}if(i == classCount){
printf("unclassifiable\n");
continue;
}
sparseRIV thing= line2L3(text, searchRoot);
if(thing.count ==0){
continue;
}
docTotal++;
denseRIV inQuestion = {0};
addS2D(inQuestion.values, thing);
inQuestion.magnitude = getMagnitudeDense(&inQuestion);
double weights[classCount];
int choice = kNearest(weights, classes, classCount, inQuestion);
if(choice == -1){
printf("classificationFailed");
}else{
//puts(text);
printf("survey says! %s ", className);
printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
}
if(choice == classNo){
won++;
}
free(thing.locations);
}
printf("\n\n we got %d/%d ", won, docTotal);
for(int i=0; i<classCount; i++){
for(int j=0; j<classes[i].setSize; j++){
free(classes[i].set[j].locations);
}
free(classes[i].set);
}
free(classes);
free(classNames);
destroyTree(searchRoot);
lexClose(lexicon);
fclose(textSet);
return 0;
}
......@@ -132,24 +214,72 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
continue;
}else{
//printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values);
temp = normalize(*wordRIV, 10000);
//temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
//free(wordRIV);
lexPush(lexicon, wordRIV);
}
}
}
temp = consolidateD2S(accumulate.values);
return temp;
}
/* kNearest classifies inQuestion by a score-weighted vote among the k
 * training vectors that cosCompare scores highest against it (a larger
 * score is treated as "nearer").
 *
 * weights:    caller-supplied array of classCount doubles; on return each
 *             entry holds that class's share of the summed scores of the
 *             retained neighbours (left as raw sums when the total is 0)
 * classes:    the labelled training sets, classCount entries
 * classCount: number of entries in classes and in weights
 * inQuestion: the vector to classify
 * returns:    index into classes of the winning class, or -1 when there
 *             was no training vector to compare against
 */
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
	double distances[k];
	int labels[k];
	int fill = 0;

	memset(weights, 0, classCount*sizeof(double));

	for(int i=0; i<classCount; i++){
		for(int j=0; j<classes[i].setSize; j++){
			double cosine = cosCompare(inQuestion, classes[i].set[j]);

			/* the first k candidates are kept unconditionally; the class
			 * is recorded with the score so the vote can credit it */
			if(fill < k){
				distances[fill] = cosine;
				labels[fill] = i;
				fill++;
				continue;
			}

			/* afterwards a candidate may only displace the weakest of
			 * the retained neighbours */
			int weakest = 0;
			for(int x=1; x<k; x++){
				if(distances[x] < distances[weakest]){
					weakest = x;
				}
			}
			if(cosine > distances[weakest]){
				distances[weakest] = cosine;
				labels[weakest] = i;
			}
		}
	}

	/* nothing to vote with */
	if(fill == 0){
		return -1;
	}

	/* only the slots that were actually filled take part in the vote */
	double totalweight = 0;
	for(int x=0; x<fill; x++){
		weights[labels[x]] += distances[x];
		totalweight += distances[x];
	}

	/* pick the winner from the raw sums, so that a zero or negative total
	 * cannot distort the ordering; the first maximum wins ties */
	int choice = -1;
	double best = 0;
	for(int i=0; i<classCount; i++){
		if(choice == -1 || weights[i] > best){
			choice = i;
			best = weights[i];
		}
	}

	/* normalise for the caller, avoiding a division by zero */
	if(totalweight != 0){
		for(int i=0; i<classCount; i++){
			weights[i] /= totalweight;
		}
	}
	return choice;
}
......
File added
This diff is collapsed. Click to expand it.
......@@ -6,10 +6,11 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
//#define HASHCACHE
#define RIVSIZE 50000
#define NONZEROS 4
#define CACHESIZE 27000
#define CACHESIZE 25000
#define SORTCACHE
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
//int COUNTY = 0;
int COUNTY = 0;
int main(int argc, char *argv[]){
char pathString[1000];
lp = lexOpen("lexicon", "rw");
lp = lexOpen("lexiconshitty", "r");
//we open the lexicon, if it does not yet exist, it will be created
......@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
strcpy(pathString, argv[1]);
strcat(pathString, "/");
//ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
......@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
if(COUNTY++>1000) return;
//process this file and add it's data to lexicon
//fprintf(stderr, "***%d", COUNTY++);
fileGrind(input);
fclose(input);
......@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(lp, word);
if(!lexiconRIV){
printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
continue;
}
//we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
......@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
}
//free the heap allocated context vector data
free(contextVector.locations);
}
void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
sparseRIV thing = context;
addS2D(lexRIV->values, thing);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises
......
# clean runs ./RIVread once for each argument, in the order given.
# Iterating over "$@" (rather than testing "$1" for emptiness) means an
# empty-string argument no longer ends the run early.
clean(){
	for target in "$@"; do
		./RIVread "$target"
	done
}
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* Entry point: calls RIVInit() and then hands the "lexicon/" directory to
 * directoryToL2s. */
int main(){
	char rootString[] = "lexicon/";

	RIVInit();
	directoryToL2s(rootString);
	return 0;
}
/* directoryToL2s walks rootString recursively and, for every regular entry,
 * pulls its vector with lexPull, consolidates it to sparse form and prints
 * the entry's path together with its non-zero count ("saturation").
 *
 * rootString: directory path; it is concatenated directly with entry names,
 *             so it must end in '/'
 *
 * Entries whose name starts with '.' are skipped. The walk of the current
 * directory stops at the first entry that cannot be opened.
 */
void directoryToL2s(char *rootString){
	char pathString[2000];
	DIR *directory = opendir(rootString);

	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}

	struct dirent *files;
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;

		if(files->d_type == DT_DIR){
			int length = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			if(length < 0 || (size_t)length >= sizeof pathString){
				printf("path too long, skipping %s%s\n", rootString, files->d_name);
				continue;
			}
			directoryToL2s(pathString);
			/* a directory is not itself a lexicon entry */
			continue;
		}

		int length = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}

		/* the file is opened only to confirm that it is readable */
		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}

		denseRIV temp = lexPull(pathString);
		sparseRIV fileRIV = consolidateD2S(temp.values);
		strcpy(fileRIV.name, pathString);
		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);
		fclose(input);
		free(temp.values);
		/* NOTE(review): fileRIV.locations is not released here; confirm
		 * whether consolidateD2S hands ownership of it to the caller */
	}

	/* every exit path after a successful opendir releases the handle */
	closedir(directory);
}
#include <stdio.h>
#include "RIVaccessories.h"
#include <time.h>
/* Interactive stem lookup: builds the stem tree, then reads whitespace
 * separated words from stdin until end of input, printing each word's stem
 * (or "no entry") together with the CPU time the lookup took.
 * Returns 1 when the tree could not be built, 0 at end of input.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		/* stemTreeSetup has already reported the missing file */
		return 1;
	}

	char word[100];
	puts("tree ready");

	/* the field width keeps long tokens inside word[]; checking the
	 * result ends the loop at EOF instead of spinning on stale input */
	while(scanf("%99s", word) == 1){
		clock_t start = clock();
		char* stem = treeSearch(root, word);
		clock_t end = clock();

		if(stem){
			puts(stem);
		}else{
			puts("no entry");
		}
		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
	}
	return 0;
}
import pymongo
from pymongo import MongoClient
def dbSetup():
    """Connect to the `rivwordnet` database and return its `stems` collection.

    The connection URI is taken from the RIVWORDNET_MONGO_URI environment
    variable when it is set; otherwise the legacy built-in URI is used so
    existing callers keep working. An index on "from" is requested before
    the collection is returned.
    """
    import os

    # SECURITY(review): the fallback URI embeds a username and password in
    # source control. Rotate that credential and supply the URI through the
    # environment instead; the literal is kept only for backward compatibility.
    uri = os.environ.get(
        "RIVWORDNET_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet",
    )
    client = MongoClient(uri)
    collection = client.rivwordnet.stems
    collection.create_index("from")
    return collection
def dbPost(wordset, collection):
    """Insert every key/value pair of `wordset` into `collection`.

    Each pair becomes one document of the form {"from": key, "to": value},
    and all documents are sent in a single `insert_many` call. An empty
    `wordset` is a no-op. Returns None.
    """
    if not len(wordset):
        return
    # items() exists on both Python 2 and Python 3 mappings; iteritems()
    # is Python 2 only
    posts = [{"from": key, "to": value} for key, value in wordset.items()]
    collection.insert_many(posts)
def cleanDbSetup():
    """Connect to the `rivetcleandocs` database and return its `cleaned` collection.

    The connection URI is taken from the RIVETCLEANDOCS_MONGO_URI
    environment variable when it is set; otherwise the legacy built-in URI
    is used so existing callers keep working. An index on "file" is
    requested before the collection is returned.
    """
    import os

    # SECURITY(review): the fallback URI embeds a username and password in
    # source control. Rotate that credential and supply the URI through the
    # environment instead; the literal is kept only for backward compatibility.
    uri = os.environ.get(
        "RIVETCLEANDOCS_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs",
    )
    client = MongoClient(uri)
    collection = client.rivetcleandocs.cleaned
    collection.create_index("file")
    return collection
def dbPostCleaned(text, file, collection):
    """Store one cleaned document in `collection`.

    The document has the form {"text": text, "file": file} and is written
    with `insert_one`. Empty `text` is skipped. Returns None.
    """
    if len(text) == 0:
        return
    collection.insert_one({"text": text, "file": file})
def dbGet(words, collection):
    """Look up the "to" value stored for `words` in `collection`.

    Returns the matching document's "to" field, or 0 when no document has
    "from" equal to `words`.

    NOTE(review): the previous body read two names that were never assigned,
    so every call raised NameError. The query below is reconstructed from
    the {"from": ..., "to": ...} documents this module writes and treats
    `words` as a single key - confirm that is the intended lookup.
    """
    mebeword = collection.find_one({"from": words})
    if mebeword:
        return mebeword["to"]
    else:
        return 0
\ No newline at end of file
#include <stdio.h>
#include "../RIVaccessories.h"
int configInsert(struct treenode* node, char* letter, int treeSize);
int stemTreeConfig();
/* Entry point: prints the node count computed by stemTreeConfig to stdout,
 * with no trailing newline. */
int main(){
	printf("%d", stemTreeConfig());
	return 0;
}
/* configInsert walks the word `letter` down the tree rooted at `node`,
 * creating any missing child on the way, and returns treeSize increased by
 * the number of nodes it had to create.
 * Every node on the path, including the final one, has its downstream
 * counter incremented. Each character is used as `c - 'a'` to index
 * links[], with no range check.
 */
int configInsert(struct treenode* node, char* letter, int treeSize){
	struct treenode* cursor = node;

	for(char* c = letter; ; c++){
		cursor->downstream++;

		/* end of the word: nothing more to create */
		if(!*c){
			return treeSize;
		}

		struct treenode** slot = &cursor->links[*c - 'a'];
		if(!*slot){
			treeSize++;
			*slot = calloc(1, sizeof(struct treenode));
		}
		cursor = *slot;
	}
}
/* stemTreeConfig counts how many tree nodes are needed to hold every word
 * listed in "wordset.txt" (whitespace separated).
 *
 * returns: the node count including the root, or 0 when the word file
 *          cannot be opened or the root node cannot be allocated
 *
 * The scratch tree built for counting is not released; the only visible
 * caller prints the result and exits.
 */
int stemTreeConfig(){
	FILE* wordFile = fopen("wordset.txt", "r");
	if(!wordFile){
		printf("no wordnet file");
		return 0;
	}

	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fclose(wordFile);
		return 0;
	}

	int treeSize = 1;
	char word[100];

	/* fscanf returns EOF (non-zero) at end of input, so the result is
	 * compared against the number of expected conversions; the field width
	 * keeps over-long tokens inside word[]. The stem list is not walked
	 * here because it has no influence on the node count. */
	while(fscanf(wordFile, "%99s", word) == 1){
		treeSize = configInsert(rootNode, word, treeSize);
	}

	fclose(wordFile);
	return treeSize;
}
#include <stdio.h>
#include "../RIVaccessories.h"
/* Entry point: prints the value returned by stemTreeConfig to stdout, with
 * no trailing newline. */
int main(){
	printf("%d", stemTreeConfig());
	return 0;
}
import dbtools
from subprocess import call
# Regenerates wordset.txt and stemset.h from the stems collection.
# stemset.h is written twice: first with a placeholder tree size, then with
# the size printed by the freshly compiled stemconf.c.

collection = dbtools.dbSetup()

# collapse the stored documents into one word -> stem mapping
stemMap = {}
for doc in collection.find():
    stemMap[doc["from"]] = doc["to"]

# the mapping is walked once so that words[i] always pairs with stems[i];
# items() works on both Python 2 and Python 3
words = []
stems = []
for key, value in stemMap.items():
    words.append(key)
    stems.append(value)

with open("wordset.txt", "w") as wordFile:
    wordFile.write(' '.join(words))

finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";' + '\nint treesize = '

# first pass: placeholder size, so the header exists when stemconf.c is built
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + '0;')

# build the counting tool and capture the node count it prints
with open("tempfile.txt", "w") as tempfile:
    call(["gcc", "stemconf.c", "-o", "stemconfig"])
    call(["./stemconfig"], stdout=tempfile)

with open("tempfile.txt", "r") as tempfile:
    treesize = tempfile.read()

# second pass: the real size; the with-block guarantees the file is closed
# (the previous `stemFile.close;` never called close)
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + treesize + ';')
This source diff could not be displayed because it is too large. You can view the blob instead.
279920
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#include <stdio.h>
#include "RIVtools.h"
/* Interactive tree exerciser. Words are read from stdin in one of two
 * modes:
 *   search mode (initial): each word is looked up with treeSearch and its
 *       stem, or "NULL return", is printed; a token starting with '1'
 *       switches to cut mode
 *   cut mode: each word is passed to treecut; a token starting with '0'
 *       switches back to search mode
 * The switching tokens themselves are not passed to the tree functions.
 * Returns 1 when the tree could not be built, 0 at end of input.
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		return 1;
	}

	char word[100];
	int searching = 1;

	/* the field width keeps long tokens inside word[]; checking the
	 * result ends the loop at EOF */
	while(scanf("%99s", word) == 1){
		if(searching){
			if(*word == '1'){
				searching = 0;
				continue;
			}
			char* stem = treeSearch(root, word);
			if(stem){
				puts(stem);
			}else{
				puts("NULL return");
			}
		}else{
			if(*word == '0'){
				searching = 1;
				continue;
			}
			treecut(root, word);
		}
	}
	return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment