aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-x df.py (renamed from tf.py) 12
1 file changed, 9 insertions, 3 deletions
diff --git a/tf.py b/df.py
index fc57f8b..0137cd8 100755
--- a/tf.py
+++ b/df.py
@@ -5,15 +5,21 @@ import json
from collections import Counter
if __name__ == '__main__':
+ queries = dict()
+ with open('queries_stopped.json') as f:
+ queries = json.load(f)
+ terms = set([t for q in queries.values() for t in q.split()])
+
store = dict()
+
for filename in os.listdir('information-retrieval-data/'):
with open('information-retrieval-data/' + filename) as f:
entity = json.load(f)
- for field, values in entity.items():
- for value in values:
+ for field, values in entity.items():
if field not in store:
store[field] = []
- store[field] += [v.lower() for v in value.split(" ")]
+ store[field] += [v.lower() for value in values for v in value.split() if v in terms]
+
for field in store:
cnt = Counter(store[field])
for term in cnt.items():