Commit e449480f by Paktalin

Finished the first version of filtering wrong occurrences out.

parent 80597f10
from util import save_csv, get_preprocessed_verbs, get_articles from util import save_csv, get_preprocessed_verbs, get_articles
from tqdm import tqdm from tqdm import tqdm
def extract_verbs_occurences_from_articles(verbs, articles):
    """Attach approximate sentence occurrences to verbs (trial run: first verb only).

    Parameters
    ----------
    verbs : pandas.DataFrame
        Must contain a "common_substring" column; an "occurences" column is added.
    articles : str
        The full article corpus as one string (newlines already removed).
    """
    verbs['occurences'] = ''
    print("finding approximate verbs occurences")
    # trial with the first verb
    verb = verbs["common_substring"][0]
    # Prefix a space so the substring match starts at a word boundary
    # (avoids matching the stem inside a longer word).
    spaced_verb = ' ' + verb
    occurences = [sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]
    # Use .at instead of chained indexing (verbs['occurences'][0] = ...),
    # which may assign to a temporary copy and silently drop the result.
    verbs.at[0, 'occurences'] = filter_wrong_occurences(verbs.iloc[0], occurences)
    # for i in tqdm(range(len(verbs))):
    #     verb = verbs["common_substring"][i]
    #     ...
    # save_csv(verbs, "with_approximate_occurences.csv")
def filter_wrong_occurences(verb, occurences):
    """Keep only the occurrences containing at least one known form of *verb*.

    Parameters
    ----------
    verb : indexable row of the verbs table
        Positions [0]-[2] hold the stems used by get_all_forms; see that helper.
    occurences : list of str
        Candidate sentences found by the approximate substring search.

    Returns
    -------
    list of str
        Deduplicated sentences that contain a real verb form.

    Fixes two defects in the original: it removed items from `occurences`
    while iterating over it (which skips the element after each removal),
    and it returned None although the caller assigns the result.
    """
    print("filtering wrong occurences")
    all_forms = get_all_forms(verb)
    # Build a fresh list instead of mutating the input during iteration.
    filtered = [occurence for occurence in occurences
                if any(form in occurence for form in all_forms)]
    # Deduplicate; note set() does not preserve the original sentence order.
    filtered = list(set(filtered))
    print(filtered)
    return filtered
def get_all_forms(verb):
    """Collect every surface form derivable from a verb row.

    verb[0], verb[1] and verb[2] hold the ma-, da- and b-infinitives; the
    infinitive ending is stripped to obtain the stem each paradigm builds on.
    verb[6] and verb[7] are appended as-is (presumably extra irregular
    forms — confirm against the preprocessed CSV columns).
    """
    ma_stem = verb[0][:-2]
    da_stem = verb[1][:-2]
    b_stem = verb[2][:-1]
    collected = forms_from_ma(ma_stem) + forms_from_da(da_stem) + forms_from_b(b_stem)
    collected.append(verb[6])
    collected.append(verb[7])
    return collected
def forms(root, endings):
    """Return root+ending strings, each suffixed with a trailing delimiter.

    For every delimiter (space, '.', '?', '!', ',') the full list of endings
    is emitted in order, so the result groups by delimiter first — identical
    ordering to concatenating one list comprehension per delimiter.
    """
    trailers = [' ', '.', '?', '!', ',']
    result = []
    for trailer in trailers:
        result.extend(root + ending + trailer for ending in endings)
    return result
def forms_from_b(root):
    """Forms built on the b-stem: present-tense and conditional endings."""
    return forms(root, ['n', 'd', 'b', 'me', 'te', 'vad', '',
                        'ksin', 'ksid', 'ks', 'ksime', 'ksite'])
def forms_from_ma(root):
    """Forms built on the ma-stem: ma-infinitive cases, participles, past tense."""
    return forms(root, ['ma', 'mas', 'mast', 'maks', 'mata', 'v', 'vat',
                        'sin', 'sid', 's', 'sime', 'site'])
def forms_from_da(root):
    """Forms built on the da-stem: da-infinitive, imperative, past conditional, impersonal."""
    return forms(root, ['da', 'gu', 'gem', 'ge',
                        'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite',
                        'di', 'nuvat', 'davat', 'des', 'dav'])
def forms_from_kse(root):
    """Forms built on the kse-stem (impersonal present and related endings)."""
    return forms(root, ['kse', 'ks', 'gu', '', 'vat', 'v'])
# Script entry point: load the verb table and article corpus, then run the
# extraction trial. Guarded so importing this module has no side effects.
if __name__ == "__main__":
    verbs = get_preprocessed_verbs()
    articles = get_articles()
    extract_verbs_occurences_from_articles(verbs, articles)
...@@ -32,7 +32,8 @@ def read_csv(path, sep, header): ...@@ -32,7 +32,8 @@ def read_csv(path, sep, header):
def get_articles():
    """Read 'articles.txt' (UTF-8) and return its text with all newlines removed."""
    with open('articles.txt', 'r', encoding='utf-8') as article_file:
        return article_file.read().replace('\n', '')
def get_preprocessed_verbs():
    """Load the preprocessed verb table from its CSV (comma-separated, first row is the header)."""
    csv_path = "preprocessed_verbs.csv"
    return read_csv(csv_path, ",", header=0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment