diff options
author | Camil Staps | 2017-12-15 12:47:43 +0100 |
---|---|---|
committer | Camil Staps | 2017-12-15 12:47:43 +0100 |
commit | 1ab437c62f7a4d59f93a0de9c590afbdfbc55a58 (patch) | |
tree | 7763beb22b52631817d8762570291993c1d167b0 | |
parent | Merge branch 'implementation' of github.com:rubigdata/IR-2017-4 into implemen... (diff) |
Use queries_stopped.json in tf.py->df.py
-rwxr-xr-x | df.py (renamed from tf.py) | 12 |
1 files changed, 9 insertions, 3 deletions
```diff
@@ -5,15 +5,21 @@ import json
 from collections import Counter

 if __name__ == '__main__':
+    queries = dict()
+    with open('queries_stopped.json') as f:
+        queries = json.load(f)
+    terms = set([t for q in queries.values() for t in q.split()])
+
     store = dict()
+
     for filename in os.listdir('information-retrieval-data/'):
         with open('information-retrieval-data/' + filename) as f:
             entity = json.load(f)
-        for field, values in entity.items():
-            for value in values:
+            for field, values in entity.items():
                 if field not in store:
                     store[field] = []
-                store[field] += [v.lower() for v in value.split(" ")]
+                store[field] += [v.lower() for value in values for v in value.split() if v in terms]
+
     for field in store:
         cnt = Counter(store[field])
         for term in cnt.items():
```