about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Camil Staps, 2017-12-15 12:47:43 +0100
committer: Camil Staps, 2017-12-15 12:47:43 +0100
commit: 1ab437c62f7a4d59f93a0de9c590afbdfbc55a58 (patch)
tree: 7763beb22b52631817d8762570291993c1d167b0
parent: Merge branch 'implementation' of github.com:rubigdata/IR-2017-4 into implemen... (diff)
Use queries_stopped.json in tf.py->df.py
-rwxr-xr-x df.py (renamed from tf.py) 12
1 files changed, 9 insertions, 3 deletions
diff --git a/tf.py b/df.py
index fc57f8b..0137cd8 100755
--- a/tf.py
+++ b/df.py
@@ -5,15 +5,21 @@ import json
from collections import Counter
if __name__ == '__main__':
+ queries = dict()
+ with open('queries_stopped.json') as f:
+ queries = json.load(f)
+ terms = set([t for q in queries.values() for t in q.split()])
+
store = dict()
+
for filename in os.listdir('information-retrieval-data/'):
with open('information-retrieval-data/' + filename) as f:
entity = json.load(f)
- for field, values in entity.items():
- for value in values:
+ for field, values in entity.items():
if field not in store:
store[field] = []
- store[field] += [v.lower() for v in value.split(" ")]
+ store[field] += [v.lower() for value in values for v in value.split() if v in terms]
+
for field in store:
cnt = Counter(store[field])
for term in cnt.items():