aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCamil Staps2017-09-20 20:04:01 +0200
committerCamil Staps2017-09-20 20:04:01 +0200
commitf52e212ed7b61bbac32e2e0ee3e6f150e3d9b918 (patch)
treebccd76aa9a89b48f24aab1063bdf3825a7f134c5
parentUse <select> for root again; fix checking answers (diff)
Simple ETCBC importer (#2 - still needs some work); requires case-sensitive collation
-rw-r--r--config/database.php2
-rwxr-xr-xtools/import_etcbc.py140
2 files changed, 141 insertions, 1 deletions
diff --git a/config/database.php b/config/database.php
index fd22e8e..427cdbd 100644
--- a/config/database.php
+++ b/config/database.php
@@ -60,7 +60,7 @@ return [
'username' => env('DB_USERNAME', 'forge'),
'password' => env('DB_PASSWORD', ''),
'charset' => 'utf8',
- 'collation' => 'utf8_unicode_ci',
+ 'collation' => 'utf8_bin',
'prefix' => '',
'strict' => true,
'engine' => null,
diff --git a/tools/import_etcbc.py b/tools/import_etcbc.py
new file mode 100755
index 0000000..4d752e7
--- /dev/null
+++ b/tools/import_etcbc.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+from tf.fabric import Fabric
+import csv
+import re
+
+# First ids handed out to the exported verbs and roots (see main()).
+# NOTE(review): presumably lower ids are already taken in the target
+# database - confirm before changing.
+VERB_STARTID = 13
+ROOT_STARTID = 2
+
+# The tables below map feature values from the ETCBC data to the values
+# written to the CSV files. A value that is missing from a table raises
+# KeyError in Verb.__init__, and main() skips that word.
+STEMS = {
+    'qal': 'Qal',
+    'hif': 'Hiphil',
+    'piel': 'Piel',
+    'nif': 'Niphal',
+    'hit': 'Hitpael',
+    'pual': 'Pual',
+    'hof': 'Hophal',
+}
+
+TENSES = {
+    'perf': 'perfect',
+    'impf': 'imperfect',
+    # Disabled on purpose: wayyiqtol forms are currently not imported.
+    #'wayq': 'wayyiqtol',
+    'ptca': 'participle active',
+    'infc': 'infinitive construct',
+    'impv': 'imperative',
+    'ptcp': 'participle passive',
+    'infa': 'infinitive absolute',
+}
+
+PERSONS = {'p1': 1, 'p2': 2, 'p3': 3, 'unknown': None}
+# Key was misspelled 'unkown', so every verb whose gender is 'unknown'
+# was dropped by the KeyError handling in main().
+GENDERS = {'m': 'm', 'f': 'f', 'unknown': None}
+NUMBERS = {'sg': 's', 'pl': 'p', 'unknown': None}
+
+class Root:
+    """A verbal root, identified solely by its lexeme string.
+
+    Two roots with the same ``lex`` compare equal and hash alike, so a
+    set of ``Root`` objects holds each lexeme once.
+    """
+
+    def __init__(self, n):
+        """Read the ``lex_utf8`` feature of node ``n``.
+
+        Relies on the module-level ``F`` that main() makes available.
+        """
+        self.lex = F.lex_utf8.v(n)
+
+    def __eq__(self, other):
+        # Let Python fall back to its default handling for foreign types
+        # instead of failing with AttributeError on ``other.lex``.
+        if not isinstance(other, Root):
+            return NotImplemented
+        return self.lex == other.lex
+
+    def __hash__(self):
+        return hash(self.lex)
+
+class Verb:
+    """One verb form read from the ETCBC data.
+
+    Equality and hashing ignore the pointing: two forms are the same
+    verb when their consonantal text, root, stem, tense, person, gender
+    and number all match. A set of ``Verb`` objects therefore keeps one
+    (arbitrary) pointed spelling per such combination.
+    """
+
+    # Removes everything outside U+05B0-U+05BC, U+05C1, U+05C2 and
+    # U+05C7-U+05EA, i.e. strips the accents but keeps points and letters.
+    _ACCENTS = re.compile(r'[^\u05b0-\u05bc\u05c1\u05c2\u05c7-\u05ea]')
+    # Removes everything that is not a Hebrew letter (U+05D0-U+05EA).
+    _NON_LETTERS = re.compile(r'[^\u05d0-\u05ea]')
+
+    def __init__(self, n):
+        """Build the verb for node ``n``.
+
+        Relies on the module-level ``F`` and ``T`` that main() makes
+        available.
+
+        Raises:
+            ValueError: the node has no text, or its text contains a sof
+                pasuq (U+05C3) or a maqaf (U+05BE).
+            KeyError: a feature value is missing from STEMS, TENSES,
+                PERSONS, GENDERS or NUMBERS.
+        """
+        self.n = n
+        verb = F.g_word_utf8.v(n)
+        if verb is None or '\u05c3' in verb or '\u05be' in verb:
+            raise ValueError('no text, sof pasuq or maqaf')
+        # strip accents
+        self.verb = self._ACCENTS.sub('', verb)
+        self.root = F.lex_utf8.v(n)
+        self.stem = STEMS[F.vs.v(n)]
+        self.tense = TENSES[F.vt.v(n)]
+        self.person = PERSONS[F.ps.v(n)]
+        self.gender = GENDERS[F.gn.v(n)]
+        self.number = NUMBERS[F.nu.v(n)]
+        self.loc = T.sectionFromNode(n)
+
+    def unpointed_word(self):
+        """Return the verb text reduced to its Hebrew letters."""
+        return self._NON_LETTERS.sub('', self.verb)
+
+    def _key(self):
+        """Return the tuple that defines this verb's identity."""
+        return (self.unpointed_word(), self.root, self.stem, self.tense,
+                self.person, self.gender, self.number)
+
+    def __eq__(self, other):
+        # Let Python fall back to its default handling for foreign types
+        # instead of failing with AttributeError.
+        if not isinstance(other, Verb):
+            return NotImplemented
+        return self._key() == other._key()
+
+    def __hash__(self):
+        # Same key as __eq__, so the two can never drift apart.
+        return hash(self._key())
+
+class Databank:
+    """Collects the distinct verbs and roots found during an import."""
+
+    def __init__(self):
+        # Sets, so items that compare equal are stored only once.
+        self.roots = set()
+        self.verbs = set()
+
+    def add_root(self, root):
+        """Record ``root``; an equal root already present is kept."""
+        self.roots.add(root)
+
+    def add_verb(self, verb):
+        """Record ``verb``; an equal verb already present is kept."""
+        self.verbs.add(verb)
+
+def handle(n, data):
+    """Add the verb and root of node ``n`` to ``data``.
+
+    Nodes whose language is not 'hbo' are ignored. Exceptions raised
+    while building the Verb (ValueError, KeyError) propagate to the
+    caller; in that case the root is not added either, because the verb
+    is constructed first.
+    """
+    is_ancient_hebrew = F.language.v(n) == 'hbo'
+    if is_ancient_hebrew:
+        data.add_verb(Verb(n))
+        data.add_root(Root(n))
+
+def main():
+    """Import verbs and roots from the ETCBC data and export them as CSV.
+
+    Walks all nodes, collects the distinct verbs and roots, prints both
+    counts and writes ``etcbc-verbs.csv`` and ``etcbc-roots.csv`` to the
+    current directory. Neither file has a header row.
+    """
+    TF = Fabric(
+        modules=['hebrew/etcbc4c'],
+        locations='~/VersionControl/etcbc-data',
+        silent=True)
+    api = TF.load('language g_word_utf8 lex_utf8 vs vt gn nu ps', silent=True)
+    # Provides the globals used throughout this file (F, T, N).
+    api.makeAvailableIn(globals())
+
+    data = Databank()
+
+    for n in N():
+        try:
+            handle(n, data)
+        except (KeyError, ValueError):
+            # Deliberate best-effort: nodes without usable text or with
+            # feature values missing from the mapping tables are skipped.
+            continue
+
+    print(len(data.verbs), len(data.roots))
+
+    # NOTE: sets are unordered, so which id a verb or root receives may
+    # differ between runs.
+    # encoding: the rows contain Hebrew text, so do not depend on the
+    # locale's default encoding. newline='': required by the csv module,
+    # otherwise line endings can be doubled on some platforms.
+    with open('etcbc-verbs.csv', 'w', encoding='utf-8', newline='') as csvverbs:
+        verbwr = csv.writer(csvverbs, quoting=csv.QUOTE_MINIMAL)
+        # Columns: id, verb, root, stem, tense, person, gender, number, active
+        for verb_id, verb in enumerate(data.verbs, start=VERB_STARTID):
+            verbwr.writerow([
+                verb_id,
+                verb.verb,
+                verb.root,
+                verb.stem,
+                verb.tense,
+                'NULL' if verb.person is None else verb.person,
+                'NULL' if verb.gender is None else verb.gender,
+                'NULL' if verb.number is None else verb.number,
+                1,
+            ])
+
+    with open('etcbc-roots.csv', 'w', encoding='utf-8', newline='') as csvroots:
+        rootwr = csv.writer(csvroots, quoting=csv.QUOTE_MINIMAL)
+        # Columns: id, root, root_kind_id
+        for root_id, root in enumerate(data.roots, start=ROOT_STARTID):
+            rootwr.writerow([root_id, root.lex, 1])
+
+
+# Run the import only when this file is executed as a script.
+if __name__ == '__main__':
+ main()