about | summary | refs | log | tree | commit | diff
path: root/df.py
diff options
context:
space:
mode:
Diffstat (limited to 'df.py')
-rwxr-xr-x df.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/df.py b/df.py
new file mode 100755
index 0000000..0137cd8
--- /dev/null
+++ b/df.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+import os
+import json
+from collections import Counter
+
+if __name__ == '__main__':
+    # Vocabulary: every whitespace-separated token of every query string.
+    with open('queries_stopped.json') as f:
+        terms = {t for q in json.load(f).values() for t in q.split()}
+
+    # Maps field name -> Counter of query-term occurrences over all entity files.
+    store = dict()
+
+    for filename in os.listdir('information-retrieval-data/'):
+        with open('information-retrieval-data/' + filename) as f:
+            entity = json.load(f)
+        for field, values in entity.items():
+            # Lowercase BEFORE filtering so the membership test and the counted
+            # key agree (assumes query terms are lowercase -- TODO confirm).
+            tokens = (v.lower() for value in values for v in value.split())
+            store.setdefault(field, Counter()).update(t for t in tokens if t in terms)
+
+    for field, cnt in store.items():
+        for term, count in cnt.items():
+            print('{}\t{}\t{}'.format(field, term, count))