Commit 9b866efc by Paktalin

Corrected regex for occurrences search

parent 291fad22
This diff is collapsed. Click to expand it.
...@@ -8,7 +8,7 @@ def extract_verbs_occurences_from_articles(verbs, articles): ...@@ -8,7 +8,7 @@ def extract_verbs_occurences_from_articles(verbs, articles):
for i in tqdm(range(len(verbs))): for i in tqdm(range(len(verbs))):
# finish the pattern # finish the pattern
pattern = '.*\W' + verbs[8][i] + '.*' pattern = '^(.*\W)*' + verbs[8][i] + '(?!(mi|ja)).*$'
occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)])) occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences) verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
save_csv(verbs, "with_approximate_occurences.csv") save_csv(verbs, "with_approximate_occurences.csv")
...@@ -20,7 +20,7 @@ def filter_wrong_occurences(verb, occurences): ...@@ -20,7 +20,7 @@ def filter_wrong_occurences(verb, occurences):
for occurence in occurences: for occurence in occurences:
found = False found = False
for form in all_forms: for form in all_forms:
pattern = '.*\W'+form+'\W.*' pattern = '^(.*\W)*'+form+'(\W.*)*$'
if re.match(pattern, occurence): if re.match(pattern, occurence):
verified_occurences.append(occurence) verified_occurences.append(occurence)
found = True found = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment