diff options
author | Camil Staps | 2019-12-30 16:57:13 +0100 |
---|---|---|
committer | Camil Staps | 2020-01-03 19:06:02 +0100 |
commit | ad10a6467f76822b9289832eeb0d0ac2038b32d1 (patch) | |
tree | 1d78251ad49be1a744dc4573a1b3cf54e295e3a4 /tools | |
parent | Clarify incorrect parsing message when input is short (diff) |
Strip down for simple app for Greek verb λύω
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/import_etcbc.py | 140 |
1 files changed, 0 insertions, 140 deletions
diff --git a/tools/import_etcbc.py b/tools/import_etcbc.py deleted file mode 100755 index 4d752e7..0000000 --- a/tools/import_etcbc.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python3 - -from tf.fabric import Fabric -import csv -import re - -VERB_STARTID=13 -ROOT_STARTID=2 - -STEMS = { - 'qal': 'Qal', - 'hif': 'Hiphil', - 'piel': 'Piel', - 'nif': 'Niphal', - 'hit': 'Hitpael', - 'pual': 'Pual', - 'hof': 'Hophal' - } - -TENSES = { - 'perf': 'perfect', - 'impf': 'imperfect', - #'wayq': 'wayyiqtol', - 'ptca': 'participle active', - 'infc': 'infinitive construct', - 'impv': 'imperative', - 'ptcp': 'participle passive', - 'infa': 'infinitive absolute' - } - -PERSONS = {'p1': 1, 'p2': 2, 'p3': 3, 'unknown': None} -GENDERS = {'m': 'm', 'f': 'f', 'unkown': None} -NUMBERS = {'sg': 's', 'pl': 'p', 'unknown': None} - -class Root: - def __init__(self, n): - self.lex = F.lex_utf8.v(n) - - def __eq__(self, other): - return self.lex == other.lex - - def __hash__(self): - return hash(self.lex) - -class Verb: - def __init__(self, n): - self.n = n - verb = F.g_word_utf8.v(n) - if verb is None or '\u05c3' in verb or '\u05be' in verb: - raise ValueError('no text, sof pasuq or maqaf') - # strip accents - self.verb = re.sub(r'[^\u05b0-\u05bc\u05c1\u05c2\u05c7-\u05ea]', '', verb) - self.root = F.lex_utf8.v(n) - self.stem = STEMS[F.vs.v(n)] - self.tense = TENSES[F.vt.v(n)] - self.person = PERSONS[F.ps.v(n)] - self.gender = GENDERS[F.gn.v(n)] - self.number = NUMBERS[F.nu.v(n)] - self.loc = T.sectionFromNode(n) - - def unpointed_word(self): - return re.sub(r'[^\u05d0-\u05ea]', '', self.verb) - - def __eq__(self, other): - return self.unpointed_word() == other.unpointed_word() and \ - self.root == other.root and \ - self.stem == other.stem and \ - self.tense == other.tense and \ - self.person == other.person and \ - self.gender == other.gender and \ - self.number == other.number - - def __hash__(self): - return hash((self.unpointed_word(), self.root, self.stem, self.tense, - self.person, self.gender, self.number)) - -class Databank: - def __init__(self): - self.verbs = set() - self.roots = set() - - def add_root(self, root): - self.roots.add(root) - - def add_verb(self, verb): - self.verbs.add(verb) - -def handle(n, data): - if F.language.v(n) != 'hbo': # Ancient Hebrew - return - data.add_verb(Verb(n)) - data.add_root(Root(n)) - -def main(): - TF = Fabric( - modules=['hebrew/etcbc4c'], - locations='~/VersionControl/etcbc-data', - silent=True) - api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True) - api.makeAvailableIn(globals()) - - data = Databank() - - for n in N(): - try: - handle(n, data) - except (KeyError, ValueError): - pass - - print(len(data.verbs), len(data.roots)) - - with open('etcbc-verbs.csv', 'w') as csvverbs: - verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL) - #verbwr.writerow(['id', 'verb','root','stem','tense','person','gender','number','active']) - i = VERB_STARTID - for verb in data.verbs: - verbwr.writerow([ - i, - verb.verb, - verb.root, - verb.stem, - verb.tense, - verb.person if verb.person is not None else 'NULL', - verb.gender if verb.gender is not None else 'NULL', - verb.number if verb.number is not None else 'NULL', - 1 - ]) - i += 1 - - with open('etcbc-roots.csv', 'w') as csvroots: - rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL) - #rootwr.writerow(['id', 'root', 'root_kind_id']) - i = ROOT_STARTID - for root in data.roots: - rootwr.writerow([i, root.lex, 1]) - i += 1 - - -if __name__ == '__main__': - main() |