about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Erin van der Veen, 2017-12-15 12:42:21 +0100
committer: Erin van der Veen, 2017-12-15 12:42:21 +0100
commit: 75e913ed3fb1172b27e2d7eaae7ad8bd7b57538e (patch)
tree: 26472c05da39f7303bd7390d8e51ed939370a3f7
parent: Use logarithm for weight measure; add code explanation (diff)
Calculate term frequency
-rwxr-xr-x  tf.py  20
1 files changed, 20 insertions, 0 deletions
diff --git a/tf.py b/tf.py
new file mode 100755
index 0000000..fc57f8b
--- /dev/null
+++ b/tf.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import os
+import json
+from collections import Counter
+
+if __name__ == '__main__':
+    # Print "field<TAB>term<TAB>count" for each lower-cased, space-split term per JSON field.
+    data_dir = 'information-retrieval-data/'
+    store = dict()  # field name -> Counter of its terms, updated incrementally
+    for filename in os.listdir(data_dir):
+        # Decode as UTF-8 explicitly (JSON's standard encoding), not the locale default.
+        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
+            entity = json.load(f)
+        for field, values in entity.items():
+            for value in values:  # NOTE(review): assumes each field maps to a list of strings
+                store.setdefault(field, Counter()).update(v.lower() for v in value.split(" "))
+    for field, counts in store.items():
+        for term, count in counts.items():
+            print('{}\t{}\t{}'.format(field, term, count))