diff options
Diffstat (limited to 'df.py')
-rwxr-xr-x | df.py | 26 |
1 files changed, 26 insertions, 0 deletions
#!/usr/bin/env python3
"""Count how often each query term occurs in each field of the collection.

The script loads the queries from ``queries_stopped.json``, collects the
distinct whitespace-separated query terms, scans every JSON entity found in
``information-retrieval-data/`` and prints one tab-separated
``field<TAB>term<TAB>count`` line for every (field, term) pair that occurs.

NOTE: the counts are total occurrences summed over all entities, not the
number of entities that contain the term.
"""

import os
import json
from collections import Counter, defaultdict

QUERIES_PATH = 'queries_stopped.json'
DATA_DIR = 'information-retrieval-data/'


def load_query_terms(path=QUERIES_PATH):
    """Return the set of distinct whitespace-separated terms of all queries.

    ``path`` names a JSON file holding an object whose values are the query
    strings; the keys are ignored.
    """
    with open(path, encoding='utf-8') as f:
        queries = json.load(f)
    return {term for query in queries.values() for term in query.split()}


def iter_entities(directory=DATA_DIR):
    """Yield the parsed JSON content of every regular file in ``directory``.

    Files are visited in sorted name order so the output order is
    reproducible; sub-directories are skipped instead of crashing ``open``.
    """
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf-8') as f:
            yield json.load(f)


def count_field_terms(entities, terms):
    """Count occurrences of the query ``terms`` per field over ``entities``.

    ``entities`` is an iterable of dicts mapping a field name to a list of
    strings (a bare string is treated as a one-element list).  Matching is
    case-insensitive: both the query terms and the tokens are lowercased
    before comparison, and the lowercased token is what gets counted.

    Returns a mapping ``field -> Counter(term -> occurrences)``.  Every field
    that was seen has an entry, even when none of its tokens matched.
    """
    # Lowercase the terms too; the tokens are lowercased before the lookup,
    # so comparing against the raw terms would silently drop matches.
    wanted = {term.lower() for term in terms}
    counts = defaultdict(Counter)
    for entity in entities:
        for field, values in entity.items():
            if isinstance(values, str):
                # Iterating a bare string would split it into characters.
                values = [values]
            # Counting incrementally avoids keeping every token in memory.
            counts[field].update(
                token
                for value in values
                for token in (raw.lower() for raw in value.split())
                if token in wanted
            )
    return counts


def main():
    """Print ``field<TAB>term<TAB>count`` for every counted pair."""
    terms = load_query_terms()
    counts = count_field_terms(iter_entities(), terms)
    for field, counter in counts.items():
        for term, count in counter.items():
            print(field, term, count, sep='\t')


if __name__ == '__main__':
    main()