#!/usr/bin/env python3
"""Print per-field term frequencies for a directory of JSON entities.

Every file in the data directory is parsed as a JSON object.  For each
field of that object, every value is split on single spaces, lower-cased
and counted.  The result is written to stdout as tab-separated rows:

    <field> TAB <term> TAB <count>

Usage:
    tf.py [DIRECTORY]

DIRECTORY defaults to ``information-retrieval-data/``.

Provenance: introduced by commit 75e913e ("Calculate term frequency").
"""

import json
import os
import sys
from collections import Counter, defaultdict

# Directory scanned when no explicit directory is supplied.
DATA_DIR = 'information-retrieval-data/'


def tokenize(value):
    """Return the lower-cased, non-empty space-separated terms of *value*.

    Splitting stays on the literal space character only; the empty strings
    that ``split(" ")`` produces for leading, trailing or repeated spaces
    are dropped so they are never counted as terms.
    """
    return [term.lower() for term in value.split(" ") if term]


def count_terms(directory=DATA_DIR):
    """Count term occurrences per field over all JSON files in *directory*.

    Args:
        directory: Path of the directory whose entries are each loaded with
            ``json.load``.

    Returns:
        A mapping ``field -> Counter(term -> occurrences)``.  Fields whose
        values never contribute a token do not appear in the mapping.

    Raises:
        OSError: If the directory or one of its entries cannot be read.
        json.JSONDecodeError: If an entry is not valid JSON.
    """
    store = defaultdict(Counter)
    # os.listdir() order is arbitrary; sorting makes the output reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with open(path, encoding='utf-8') as f:
            entity = json.load(f)
        # NOTE(review): assumes every field maps to an iterable of strings
        # (e.g. a list of values) -- confirm against the data set.
        for field, values in entity.items():
            for value in values:
                # Count incrementally instead of collecting every token of
                # the corpus in a list first.
                store[field].update(tokenize(value))
    return store


def main(directory=DATA_DIR):
    """Print one ``field<TAB>term<TAB>count`` row per distinct term."""
    for field, counts in count_terms(directory).items():
        for term, frequency in counts.items():
            print('{}\t{}\t{}'.format(field, term, frequency))


if __name__ == '__main__':
    # An optional first argument overrides the default data directory.
    main(sys.argv[1] if len(sys.argv) > 1 else DATA_DIR)