diff options
| author | Camil Staps | 2019-12-30 16:57:13 +0100 | 
|---|---|---|
| committer | Camil Staps | 2020-01-03 19:06:02 +0100 | 
| commit | ad10a6467f76822b9289832eeb0d0ac2038b32d1 (patch) | |
| tree | 1d78251ad49be1a744dc4573a1b3cf54e295e3a4 /tools | |
| parent | Clarify incorrect parsing message when input is short (diff) | |
Strip down for simple app for Greek verb λύω
Diffstat (limited to 'tools')
| -rwxr-xr-x | tools/import_etcbc.py | 140 | 
1 files changed, 0 insertions, 140 deletions
#!/usr/bin/env python3
"""Export Hebrew verb forms and their roots from ETCBC Text-Fabric data.

Every node of the loaded corpus is inspected; nodes that are Hebrew words
with a known verbal stem and tense are collected (deduplicated on their
unpointed spelling plus parsing) and written to ``etcbc-verbs.csv``. The
distinct lexemes of those verbs are written to ``etcbc-roots.csv``.

The names ``F``, ``T`` and ``N`` used below are not defined in this file:
``main`` injects them into the module globals through
``api.makeAvailableIn(globals())`` before any node is processed.
"""

import csv
import re

# First id handed out in each CSV file; subsequent rows count up from here.
VERB_STARTID = 13
ROOT_STARTID = 2

# ETCBC feature value -> name written to the CSV.  A feature value that is
# missing from one of these tables raises KeyError in Verb.__init__, which
# main() treats as "skip this node".
STEMS = {
    'qal':  'Qal',
    'hif':  'Hiphil',
    'piel': 'Piel',
    'nif':  'Niphal',
    'hit':  'Hitpael',
    'pual': 'Pual',
    'hof':  'Hophal',
}

TENSES = {
    'perf': 'perfect',
    'impf': 'imperfect',
    # 'wayq' (wayyiqtol) is commented out in the original table, so those
    # forms are skipped:
    #'wayq': 'wayyiqtol',
    'ptca': 'participle active',
    'infc': 'infinitive construct',
    'impv': 'imperative',
    'ptcp': 'participle passive',
    'infa': 'infinitive absolute',
}

PERSONS = {'p1': 1, 'p2': 2, 'p3': 3, 'unknown': None}
# Fixed: the key used to be spelled 'unkown', so every verb whose gender
# feature is 'unknown' raised KeyError and was silently dropped.
GENDERS = {'m': 'm', 'f': 'f', 'unknown': None}
NUMBERS = {'sg': 's', 'pl': 'p', 'unknown': None}

# Compiled once instead of on every node.
# Keeps U+05B0-U+05BC, U+05C1, U+05C2 and U+05C7-U+05EA; everything else
# (the original comment calls this "strip accents") is removed.
_ACCENTS = re.compile(r'[^\u05b0-\u05bc\u05c1\u05c2\u05c7-\u05ea]')
# Keeps only U+05D0-U+05EA, i.e. the bare consonants.
_NON_LETTERS = re.compile(r'[^\u05d0-\u05ea]')


class Root:
    """The lexeme of a node; two roots are equal when their lexemes match."""

    def __init__(self, n):
        self.lex = F.lex_utf8.v(n)

    def __eq__(self, other):
        if not isinstance(other, Root):
            return NotImplemented
        return self.lex == other.lex

    def __hash__(self):
        return hash(self.lex)


class Verb:
    """A single parsed verb form read from node ``n``.

    Raises:
        ValueError: the node has no text, or its text contains U+05C3
            (sof pasuq) or U+05BE (maqaf).
        KeyError: one of the stem/tense/person/gender/number feature values
            is not present in the lookup tables above.
    """

    def __init__(self, n):
        self.n = n
        verb = F.g_word_utf8.v(n)
        if verb is None or '\u05c3' in verb or '\u05be' in verb:
            raise ValueError('no text, sof pasuq or maqaf')
        # strip accents
        self.verb = _ACCENTS.sub('', verb)
        self.root = F.lex_utf8.v(n)
        self.stem = STEMS[F.vs.v(n)]
        self.tense = TENSES[F.vt.v(n)]
        self.person = PERSONS[F.ps.v(n)]
        self.gender = GENDERS[F.gn.v(n)]
        self.number = NUMBERS[F.nu.v(n)]
        self.loc = T.sectionFromNode(n)

    def unpointed_word(self):
        """Return the verb with everything but U+05D0-U+05EA removed."""
        return _NON_LETTERS.sub('', self.verb)

    def _key(self):
        # Identity used for deduplication: the pointing of the word is
        # deliberately not part of it, only the bare spelling and parsing.
        return (self.unpointed_word(), self.root, self.stem, self.tense,
                self.person, self.gender, self.number)

    def __eq__(self, other):
        if not isinstance(other, Verb):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())


class Databank:
    """Deduplicating collection of the verbs and roots seen so far."""

    def __init__(self):
        self.verbs = set()
        self.roots = set()

    def add_root(self, root):
        self.roots.add(root)

    def add_verb(self, verb):
        self.verbs.add(verb)


def handle(n, data):
    """Add node ``n`` to ``data`` if it is an Ancient Hebrew verb.

    Non-Hebrew nodes are ignored.  For other nodes the exceptions of
    ``Verb`` (KeyError / ValueError) propagate to the caller; the root is
    only recorded when the verb itself was accepted.
    """
    if F.language.v(n) != 'hbo': # Ancient Hebrew
        return
    data.add_verb(Verb(n))
    data.add_root(Root(n))


def main():
    """Load the corpus, collect all verbs and write both CSV files."""
    # Imported here rather than at module level so that the classes above
    # can be imported without Text-Fabric being installed.
    from tf.fabric import Fabric

    TF = Fabric(
        modules=['hebrew/etcbc4c'],
        locations='~/VersionControl/etcbc-data',
        silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    # Defines F, T, N (among others) as globals of this module.
    api.makeAvailableIn(globals())

    data = Databank()

    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            # Not a verb form we export (unknown stem/tense, no text, ...).
            continue

    print(len(data.verbs), len(data.roots))

    # newline='' is what the csv module asks for; utf-8 is stated explicitly
    # because the rows contain Hebrew text and the locale default may differ.
    with open('etcbc-verbs.csv', 'w', newline='', encoding='utf-8') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        # Columns: id, verb, root, stem, tense, person, gender, number, active
        # Sorted by node number so that ids are reproducible between runs
        # (plain set iteration order depends on string hash randomisation).
        ordered_verbs = sorted(data.verbs, key=lambda v: v.n)
        for i, verb in enumerate(ordered_verbs, start=VERB_STARTID):
            verbwr.writerow([
                i,
                verb.verb,
                verb.root,
                verb.stem,
                verb.tense,
                verb.person if verb.person is not None else 'NULL',
                verb.gender if verb.gender is not None else 'NULL',
                verb.number if verb.number is not None else 'NULL',
                1,
            ])

    with open('etcbc-roots.csv', 'w', newline='', encoding='utf-8') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        # Columns: id, root, root_kind_id
        ordered_roots = sorted(data.roots, key=lambda r: r.lex)
        for i, root in enumerate(ordered_roots, start=ROOT_STARTID):
            rootwr.writerow([i, root.lex, 1])


if __name__ == '__main__':
    main()
