diff options
| author | Camil Staps | 2017-09-20 20:04:01 +0200 | 
|---|---|---|
| committer | Camil Staps | 2017-09-20 20:04:01 +0200 | 
| commit | f52e212ed7b61bbac32e2e0ee3e6f150e3d9b918 (patch) | |
| tree | bccd76aa9a89b48f24aab1063bdf3825a7f134c5 /tools | |
| parent | Use <select> for root again; fix checking answers (diff) | |
Simple ETCBC importer (#2 - still needs some work); requires case-sensitive collation
Diffstat (limited to 'tools')
| -rwxr-xr-x | tools/import_etcbc.py | 140 | 
1 files changed, 140 insertions, 0 deletions
#!/usr/bin/env python3
"""Export Hebrew verb forms and their roots from ETCBC data to CSV.

Source file: tools/import_etcbc.py

The script walks every node of the ``hebrew/etcbc4c`` Text-Fabric module,
keeps the Hebrew words whose features map onto the tables below, and writes
the distinct verb forms to ``etcbc-verbs.csv`` and the distinct lexemes to
``etcbc-roots.csv`` in the current working directory.

``F``, ``T`` and ``N`` are not defined in this file: ``main`` asks the
Text-Fabric API to inject them into the module globals
(``api.makeAvailableIn(globals())``) before any ``Root`` or ``Verb`` is
constructed.
"""

import csv
import re

# First id written to each CSV file; later rows count upwards from here.
# NOTE(review): presumably the lower ids are already taken in the target
# database -- confirm before changing.
VERB_STARTID = 13
ROOT_STARTID = 2

# ETCBC ``vs`` (verbal stem) value -> name written to the CSV.
# Values missing from this table make ``Verb`` raise KeyError, so such
# words are skipped by ``main``.
STEMS = {
    'qal':  'Qal',
    'hif':  'Hiphil',
    'piel': 'Piel',
    'nif':  'Niphal',
    'hit':  'Hitpael',
    'pual': 'Pual',
    'hof':  'Hophal',
}

# ETCBC ``vt`` (verbal tense) value -> name written to the CSV.
TENSES = {
    'perf': 'perfect',
    'impf': 'imperfect',
    # 'wayq': 'wayyiqtol',  # disabled: wayyiqtol forms are skipped
    'ptca': 'participle active',
    'infc': 'infinitive construct',
    'impv': 'imperative',
    'ptcp': 'participle passive',
    'infa': 'infinitive absolute',
}

# ETCBC ``ps`` / ``gn`` / ``nu`` values -> value written to the CSV.
# ``None`` is rendered as the literal string 'NULL' when writing.
PERSONS = {'p1': 1, 'p2': 2, 'p3': 3, 'unknown': None}
# Fixed: the key used to be misspelled 'unkown', so every word with an
# unknown gender raised KeyError and was silently dropped.
GENDERS = {'m': 'm', 'f': 'f', 'unknown': None}
NUMBERS = {'sg': 's', 'pl': 'p', 'unknown': None}

# Matches everything outside U+05B0-U+05BC, U+05C1, U+05C2 and
# U+05C7-U+05EA; removing the matches strips the accents from a word.
_NOT_POINTED_TEXT = re.compile(r'[^\u05b0-\u05bc\u05c1\u05c2\u05c7-\u05ea]')
# Matches everything outside U+05D0-U+05EA (the Hebrew letters).
_NOT_LETTER = re.compile(r'[^\u05d0-\u05ea]')


class Root:
    """A lexeme, identified solely by its ``lex_utf8`` feature value."""

    def __init__(self, n):
        """Read the lexeme of node ``n`` from the injected ``F`` API."""
        self.lex = F.lex_utf8.v(n)

    def __eq__(self, other):
        if not isinstance(other, Root):
            return NotImplemented
        return self.lex == other.lex

    def __hash__(self):
        return hash(self.lex)


class Verb:
    """One verb form together with its parsing.

    Two verbs are equal when their unpointed spelling, root, stem, tense,
    person, gender and number all agree; the pointing itself (``verb``),
    the node number (``n``) and the location (``loc``) are ignored.
    """

    def __init__(self, n):
        """Build the verb for node ``n`` from the injected ``F``/``T`` API.

        Raises:
            ValueError: the node has no text, or the text contains a
                sof pasuq (U+05C3) or a maqaf (U+05BE).
            KeyError: one of the node's features has a value that is not
                in the module-level lookup tables.
        """
        self.n = n
        verb = F.g_word_utf8.v(n)
        if verb is None or '\u05c3' in verb or '\u05be' in verb:
            raise ValueError('no text, sof pasuq or maqaf')
        # strip accents
        self.verb = _NOT_POINTED_TEXT.sub('', verb)
        self.root = F.lex_utf8.v(n)
        self.stem = STEMS[F.vs.v(n)]
        self.tense = TENSES[F.vt.v(n)]
        self.person = PERSONS[F.ps.v(n)]
        self.gender = GENDERS[F.gn.v(n)]
        self.number = NUMBERS[F.nu.v(n)]
        # Stored for reference only; not written to the CSV.
        self.loc = T.sectionFromNode(n)

    def unpointed_word(self):
        """Return ``self.verb`` reduced to the characters U+05D0-U+05EA."""
        return _NOT_LETTER.sub('', self.verb)

    def _key(self):
        # Single source of truth for both __eq__ and __hash__.
        return (self.unpointed_word(), self.root, self.stem, self.tense,
                self.person, self.gender, self.number)

    def __eq__(self, other):
        if not isinstance(other, Verb):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())


class Databank:
    """Collects the distinct verbs and roots seen so far."""

    def __init__(self):
        self.verbs = set()
        self.roots = set()

    def add_root(self, root):
        """Add ``root``; an equal root that is already present is kept."""
        self.roots.add(root)

    def add_verb(self, verb):
        """Add ``verb``; an equal verb that is already present is kept."""
        self.verbs.add(verb)


def handle(n, data):
    """Add node ``n`` to ``data`` as a verb and a root if it is Hebrew.

    Nodes whose ``language`` feature is not 'hbo' are ignored. The verb is
    built first, so when ``Verb`` raises (KeyError or ValueError) the root
    is not added either.
    """
    if F.language.v(n) != 'hbo':  # Ancient Hebrew
        return
    data.add_verb(Verb(n))
    data.add_root(Root(n))


def _null(value):
    """Return ``value``, or the string 'NULL' when it is None."""
    return value if value is not None else 'NULL'


def main():
    """Load the ETCBC data, collect verbs and roots, write both CSV files."""
    # Imported here rather than at module level so that the classes above
    # can be imported without Text-Fabric being installed.
    from tf.fabric import Fabric

    TF = Fabric(
        modules=['hebrew/etcbc4c'],
        locations='~/VersionControl/etcbc-data',
        silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    # Injects F, T and N (among others) into this module's globals.
    api.makeAvailableIn(globals())

    data = Databank()

    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            # Deliberate filter, not an error: the node's features are not
            # in the lookup tables, or its text is unusable.
            continue

    print(len(data.verbs), len(data.roots))

    # newline='' is what the csv module requires of its file objects;
    # the explicit encoding keeps the Hebrew text independent of the locale.
    with open('etcbc-verbs.csv', 'w', encoding='utf-8', newline='') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        # Columns: id, verb, root, stem, tense, person, gender, number, active
        # Sorted by node number so ids are reproducible between runs
        # (plain set iteration order varies with string hash randomisation).
        ordered_verbs = sorted(data.verbs, key=lambda v: v.n)
        for i, verb in enumerate(ordered_verbs, VERB_STARTID):
            verbwr.writerow([
                i,
                verb.verb,
                verb.root,
                verb.stem,
                verb.tense,
                _null(verb.person),
                _null(verb.gender),
                _null(verb.number),
                1,
            ])

    with open('etcbc-roots.csv', 'w', encoding='utf-8', newline='') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        # Columns: id, root, root_kind_id
        ordered_roots = sorted(data.roots, key=lambda r: r.lex or '')
        for i, root in enumerate(ordered_roots, ROOT_STARTID):
            rootwr.writerow([i, root.lex, 1])


if __name__ == '__main__':
    main()
