From 7c668cf2a00f770f54ac11597d9149fa42f502a5 Mon Sep 17 00:00:00 2001 From: Camil Staps Date: Fri, 15 Dec 2017 12:38:43 +0100 Subject: Add analysis on qrels --- Implementation.md | 40 +++++++++++++++++++++++++++++++++++++++- run.py | 9 ++++++--- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/Implementation.md b/Implementation.md index e340ace..1a81e6f 100644 --- a/Implementation.md +++ b/Implementation.md @@ -39,6 +39,10 @@ set of documents, *Q* the set of queries, *tf* the function that counts the amount of times any of the query terms was found in that field and |*f*| the size of the field. +The formula assumes that relevance is more or less linear. The logarithm is +used because more occurrences of the same term are not as important as the +first occurrence. + ## Code We use three Python programs that: @@ -201,7 +205,8 @@ The system is agnostic with regards to the ranking function (BM25 or another method). ## Intermediate Results -These are the thirty most important fields as found by our measure: +These are the thirty most important fields as found by our measure when used on +the BM25 relevance scores: | Field | Score | Used by Nordlys | |------------------------------|----------:|:---------------:| @@ -246,5 +251,38 @@ In fact, we expect that many of the fields not used actually display similarities with fields that *are* indexed. For example, the `` field will probably match because the title is repeated in the abstract. +We can perform the same analysis on the human assessments. 
This gives a rather +different list of fields: + +| Field | Score | Rank for BM25 | Used by Nordlys | +|-------------------------------|---------:|--------------:|:---------------:| +| `` | 133.77 | 28 | ![][n] | +| `` | 136.32 | 266 | ![][n] | +| `` | 139.85 | 13 | ![][n] | +| `` | 164.91 | 49 | ![][n] | +| `` | 166.35 | 30 | ![][n] | +| `` | 170.93 | 11 | ![][n] | +| `` | 173.92 | 299 | ![][n] | +| `` | 186.37 | 12 | ![][n] | +| `` | 297.25 | 802 | ![][n] | +| `` | 328.93 | 10 | ![][n] | +| `` | 332.05 | 8 | ![][n] | +| `` | 334.79 | 9 | ![][n] | +| `` | 648.73 | 7 | ![][n] | +| `` | 1436.74 | 21 | ![][y] | +| `` | 1961.98 | 6 | ![][y] | +| `` | 2086.67 | 5 | ![][y] | +| `` | 2897.51 | 4 | ![][y] | +| `` | 3483.06 | 3 | ![][y] | +| `` | 12323.46 | 2 | ![][n] | +| `` | 13002.74 | 1 | ![][n] | + +Based on this, one may want to try adding high-scoring fields that Nordlys does +not use yet (for example the two top-scoring fields in the table above) to the +index. + +Conversely, this information can also be used to improve the relevance measure. +Apparently, some fields (those ranked 266, 299 and 802 for BM25 in the table above) are +quite relevant according to human assessors, but not at all according to BM25. + [y]: http://i.stack.imgur.com/iro5J.png [n]: http://i.stack.imgur.com/asAya.png diff --git a/run.py b/run.py index 7b42ea8..1551717 100755 --- a/run.py +++ b/run.py @@ -22,7 +22,10 @@ def match(value, terms): return False def run(queries, line): - query, _, dbpediaid, _, relevance, method = line.split('\t') + try: + query, _, dbpediaid, _, relevance, method = line.split('\t') + except ValueError: # For qrels.txt + query, _, dbpediaid, relevance = line.split('\t') terms = queries[query].split() try: result = get(dbpediaid) @@ -33,8 +36,8 @@ def run(queries, line): for value in values: if match(value, terms): matches += 1 - print('{}\t{}\t{}\t{}\t{}\t{}\n'.format( - query, dbpediaid, relevance, field, len(values), matches)) + print('{}\t{}\t{}\t{}\t{}\t{}'.format( + query, dbpediaid, float(relevance), field, len(values), matches)) except Exception as e: print(dbpediaid) print(e) -- cgit v1.2.3