diff options
author | Camil Staps | 2017-09-20 20:04:01 +0200 |
---|---|---|
committer | Camil Staps | 2017-09-20 20:04:01 +0200 |
commit | f52e212ed7b61bbac32e2e0ee3e6f150e3d9b918 (patch) | |
tree | bccd76aa9a89b48f24aab1063bdf3825a7f134c5 | |
parent | Use <select> for root again; fix checking answers (diff) |
Simple ETCBC importer (#2 - still needs some work); requires case-sensitive collation
-rw-r--r-- | config/database.php | 2 | ||||
-rwxr-xr-x | tools/import_etcbc.py | 140 |
2 files changed, 141 insertions, 1 deletions
#!/usr/bin/env python3
"""Simple ETCBC importer (tools/import_etcbc.py).

Loads the ETCBC data set through Text-Fabric, collects the distinct Hebrew
verb forms and their roots, and writes them to ``etcbc-verbs.csv`` and
``etcbc-roots.csv`` in the current working directory.

Companion change from the same commit, in config/database.php: the database
collation was switched from 'utf8_unicode_ci' to 'utf8_bin', because the
imported data requires a case-sensitive collation.
"""

import csv
import re

# First ids handed out to the exported rows.
# NOTE(review): presumably lower ids are already taken in the target
# database -- confirm before changing.
VERB_STARTID = 13
ROOT_STARTID = 2

# ETCBC feature value -> name used in the exported CSV.  A value missing from
# one of these tables raises KeyError in Verb.__init__, and main() skips that
# node.
STEMS = {
    'qal': 'Qal',
    'hif': 'Hiphil',
    'piel': 'Piel',
    'nif': 'Niphal',
    'hit': 'Hitpael',
    'pual': 'Pual',
    'hof': 'Hophal',
}

TENSES = {
    'perf': 'perfect',
    'impf': 'imperfect',
    # 'wayq': 'wayyiqtol',  # disabled in the original; such forms are skipped
    'ptca': 'participle active',
    'infc': 'infinitive construct',
    'impv': 'imperative',
    'ptcp': 'participle passive',
    'infa': 'infinitive absolute',
}

PERSONS = {'p1': 1, 'p2': 2, 'p3': 3, 'unknown': None}
# Fixed: the key used to be misspelled 'unkown', so every word whose gender
# feature is 'unknown' raised KeyError and was silently dropped.
GENDERS = {'m': 'm', 'f': 'f', 'unknown': None}
NUMBERS = {'sg': 's', 'pl': 'p', 'unknown': None}

# Everything outside U+05B0-U+05BC, U+05C1, U+05C2 and U+05C7-U+05EA is
# removed from a word ("strip accents" in the original).
_NON_POINTED_RE = re.compile(r'[^\u05b0-\u05bc\u05c1\u05c2\u05c7-\u05ea]')
# Everything outside the letter range U+05D0-U+05EA.
_NON_LETTER_RE = re.compile(r'[^\u05d0-\u05ea]')

# NOTE: the names F, T and N used below are not defined in this file; main()
# publishes the Text-Fabric API into this module's globals through
# api.makeAvailableIn(globals()) before any of them is touched.


def _sort_token(value):
    """Return a string stand-in for *value* so None/int/str sort together."""
    return '' if value is None else str(value)


class Root:
    """A root, identified by the ``lex_utf8`` feature of a node."""

    def __init__(self, n):
        """Read the lexeme of node *n*."""
        self.lex = F.lex_utf8.v(n)

    def __eq__(self, other):
        if not isinstance(other, Root):
            return NotImplemented
        return self.lex == other.lex

    def __hash__(self):
        return hash(self.lex)


class Verb:
    """One verb form together with its parsing.

    Two verbs are equal when their unpointed spelling, root, stem, tense,
    person, gender and number all match; the pointing itself is ignored.
    """

    def __init__(self, n):
        """Build a verb from node *n*.

        Raises:
            ValueError: the node has no text, or the text contains U+05C3
                (sof pasuq) or U+05BE (maqaf).
            KeyError: one of the stem/tense/person/gender/number feature
                values is not in the lookup tables above.
        """
        self.n = n
        verb = F.g_word_utf8.v(n)
        if verb is None or '\u05c3' in verb or '\u05be' in verb:
            raise ValueError('no text, sof pasuq or maqaf')
        # strip accents
        self.verb = _NON_POINTED_RE.sub('', verb)
        self.root = F.lex_utf8.v(n)
        self.stem = STEMS[F.vs.v(n)]
        self.tense = TENSES[F.vt.v(n)]
        self.person = PERSONS[F.ps.v(n)]
        self.gender = GENDERS[F.gn.v(n)]
        self.number = NUMBERS[F.nu.v(n)]
        self.loc = T.sectionFromNode(n)

    def unpointed_word(self):
        """Return the verb reduced to characters in U+05D0-U+05EA."""
        return _NON_LETTER_RE.sub('', self.verb)

    def _key(self):
        """Return the tuple that defines this verb's identity."""
        return (self.unpointed_word(), self.root, self.stem, self.tense,
                self.person, self.gender, self.number)

    def __eq__(self, other):
        if not isinstance(other, Verb):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())


class Databank:
    """Collects the distinct verbs and roots seen during an import."""

    def __init__(self):
        self.verbs = set()
        self.roots = set()

    def add_root(self, root):
        """Remember *root*; an equal root already stored is kept."""
        self.roots.add(root)

    def add_verb(self, verb):
        """Remember *verb*; an equal verb already stored is kept."""
        self.verbs.add(verb)


def handle(n, data):
    """Add node *n* to *data* as a verb plus root if its language is 'hbo'.

    Propagates the KeyError/ValueError raised by ``Verb`` for unsuitable
    nodes; the root is only recorded once the verb was built successfully.
    """
    if F.language.v(n) != 'hbo':  # Ancient Hebrew
        return
    data.add_verb(Verb(n))
    data.add_root(Root(n))


def write_verbs_csv(verbs, path='etcbc-verbs.csv', start_id=VERB_STARTID):
    """Write *verbs* to *path*, numbering the rows from *start_id*.

    Columns: id, verb, root, stem, tense, person, gender, number, active.
    Missing person/gender/number are written as the text ``NULL`` and
    ``active`` is always 1.  Rows are sorted so that ids are the same on
    every run (iteration order of a set is not stable between runs).
    """
    def sort_key(verb):
        return tuple(_sort_token(part) for part in (
            verb.root, verb.unpointed_word(), verb.stem, verb.tense,
            verb.person, verb.gender, verb.number))

    def or_null(value):
        return value if value is not None else 'NULL'

    # newline='' is what the csv module requires; utf-8 is stated explicitly
    # because the content is Hebrew and must not depend on the locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvverbs:
        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
        ordered = sorted(verbs, key=sort_key)
        for rowid, verb in enumerate(ordered, start=start_id):
            verbwr.writerow([
                rowid,
                verb.verb,
                verb.root,
                verb.stem,
                verb.tense,
                or_null(verb.person),
                or_null(verb.gender),
                or_null(verb.number),
                1,
            ])


def write_roots_csv(roots, path='etcbc-roots.csv', start_id=ROOT_STARTID):
    """Write *roots* to *path*, numbering the rows from *start_id*.

    Columns: id, root, root_kind_id (always 1).  Rows are sorted by lexeme
    so that ids are the same on every run.
    """
    with open(path, 'w', newline='', encoding='utf-8') as csvroots:
        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
        ordered = sorted(roots, key=lambda root: _sort_token(root.lex))
        for rowid, root in enumerate(ordered, start=start_id):
            rootwr.writerow([rowid, root.lex, 1])


def main():
    """Load the ETCBC data, collect verbs and roots, and export both CSVs."""
    # Imported here because only main() needs Text-Fabric; the classes above
    # stay importable without it.
    from tf.fabric import Fabric

    TF = Fabric(
        modules=['hebrew/etcbc4c'],
        locations='~/VersionControl/etcbc-data',
        silent=True)
    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
    api.makeAvailableIn(globals())

    data = Databank()

    for n in N():
        try:
            handle(n, data)
        except (KeyError, ValueError):
            # Best-effort filter: nodes whose features are not in the lookup
            # tables, or whose text is unusable, are simply not exported.
            continue

    print(len(data.verbs), len(data.roots))

    write_verbs_csv(data.verbs)
    write_roots_csv(data.roots)


if __name__ == '__main__':
    main()