#!/usr/bin/env python3
"""Count how often each query term occurs in each entity field.

Reconstructed from the patch "Use queries_stopped.json in tf.py->df.py"
(Camil Staps, 2017-12-15), which adds this script as df.py and deletes
tf.py.  The deleted tf.py counted every token; this script only counts
tokens that also occur in one of the queries in queries_stopped.json.

Output: one line per (field, term) pair, formatted as
``field<TAB>term<TAB>count``.

NOTE(review): the counts are total occurrences summed over all entities,
not the number of entities that contain the term -- confirm that this is
what "df" is meant to be.
"""

import os
import json
from collections import Counter

QUERIES_FILE = 'queries_stopped.json'
DATA_DIR = 'information-retrieval-data/'


def _load_query_terms(path):
    """Return the set of lower-cased terms used by the queries in *path*.

    The JSON file must hold an object whose values are query strings; each
    query is split on whitespace.
    """
    with open(path, encoding='utf-8') as f:
        queries = json.load(f)
    # Lower-cased so that terms and document tokens are compared in the
    # same form (document tokens are lower-cased in _count_terms).
    return {t.lower() for q in queries.values() for t in q.split()}


def _iter_entities(data_dir):
    """Yield the parsed JSON content of every file in *data_dir*."""
    # os.listdir() returns names in arbitrary order; sorting keeps the
    # order of the printed lines reproducible between runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            yield json.load(f)


def _count_terms(entities, terms):
    """Return ``{field: Counter({term: occurrences})}`` for tokens in *terms*.

    Each entity is assumed to map field names to lists of strings, which is
    what the nested iteration and ``str.split`` below require.
    """
    counts = {}
    for entity in entities:
        for field, values in entity.items():
            field_counts = counts.setdefault(field, Counter())
            for value in values:
                # Normalise case *before* the membership test.  Testing the
                # raw token would silently drop e.g. "Paris" even though
                # "paris" is a query term.
                tokens = (v.lower() for v in value.split())
                # Counting on the fly avoids keeping every matching token
                # of the whole collection in memory.
                field_counts.update(t for t in tokens if t in terms)
    return counts


def main(queries_file=QUERIES_FILE, data_dir=DATA_DIR):
    """Print ``field<TAB>term<TAB>count`` for every counted (field, term) pair.

    queries_file -- JSON file mapping query ids to query strings.
    data_dir     -- directory containing one JSON file per entity.
    """
    terms = _load_query_terms(queries_file)
    counts = _count_terms(_iter_entities(data_dir), terms)
    for field, field_counts in counts.items():
        for term, count in field_counts.items():
            print('{}\t{}\t{}'.format(field, term, count))


if __name__ == '__main__':
    main()