aboutsummaryrefslogtreecommitdiff
path: root/df.py
blob: 0137cd84e3f4d4e1bb499e427672d984051d5ff8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python3

import os
import json
from collections import Counter

QUERIES_PATH = 'queries_stopped.json'
DATA_DIR = 'information-retrieval-data/'


def extract_query_terms(queries):
    """Return the set of distinct, lower-cased terms used by *queries*.

    *queries* is the mapping loaded from the queries file; every value is
    split on whitespace and each resulting term is lower-cased so that the
    later membership test is case-insensitive.
    """
    return {term.lower() for query in queries.values() for term in query.split()}


def iter_entities(data_dir):
    """Yield the parsed JSON content of every file found in *data_dir*.

    Files are visited in sorted name order: ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the order of the printed
    rows vary between runs and filesystems.
    """
    for filename in sorted(os.listdir(data_dir)):
        # JSON text is UTF-8 by specification; do not rely on the locale.
        with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
            yield json.load(f)


def count_field_terms(entities, terms):
    """Count, per field, how often each query term occurs in *entities*.

    Each entity is iterated as ``field -> values`` where every value is
    split on whitespace (assumes a list of strings per field -- TODO confirm
    against the data set).  Tokens are lower-cased *before* being looked up
    in *terms*, so *terms* must already be lower-cased (see
    ``extract_query_terms``).

    Returns a dict mapping each field name seen to a ``Counter`` of
    lower-cased term -> number of occurrences.  Fields appear in order of
    first appearance, including fields that matched no term.

    NOTE(review): every occurrence is counted, not one per entity.  If a
    per-document frequency is what is wanted, this needs to change --
    confirm the intended statistic.
    """
    counts = {}
    for entity in entities:
        for field, values in entity.items():
            field_counts = counts.setdefault(field, Counter())
            for value in values:
                for token in value.split():
                    token = token.lower()
                    if token in terms:
                        field_counts[token] += 1
    return counts


def main():
    """Print one ``field<TAB>term<TAB>count`` row per field/term pair."""
    with open(QUERIES_PATH, encoding='utf-8') as f:
        queries = json.load(f)
    terms = extract_query_terms(queries)

    counts = count_field_terms(iter_entities(DATA_DIR), terms)

    for field, field_counts in counts.items():
        for term, count in field_counts.items():
            print('{}\t{}\t{}'.format(field, term, count))


if __name__ == '__main__':
    main()