From 7c668cf2a00f770f54ac11597d9149fa42f502a5 Mon Sep 17 00:00:00 2001
From: Camil Staps
Date: Fri, 15 Dec 2017 12:38:43 +0100
Subject: Add analysis on qrels

---
 Implementation.md | 40 +++++++++++++++++++++++++++++++++++++++-
 run.py            |  9 ++++++---
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/Implementation.md b/Implementation.md
index e340ace..1a81e6f 100644
--- a/Implementation.md
+++ b/Implementation.md
@@ -39,6 +39,10 @@ set of documents, *Q* the set of queries, *tf* the function that counts the
 amount of times any of the query terms was found in that field and |*f*| the
 size of the field.
 
+The formula assumes that relevance is more or less linear. The logarithm is
+used because additional occurrences of the same term are not as important as
+the first occurrence.
+
 ## Code
 
 We use three Python programs that:
@@ -201,7 +205,8 @@ The system is agnostic with regards to the ranking function (BM25 or another
 method).
 
 ## Intermediate Results
-These are the thirty most important fields as found by our measure:
+These are the thirty most important fields as found by our measure when used on
+the BM25 relevance scores:
 
 | Field                        | Score     | Used by Nordlys |
 |------------------------------|----------:|:---------------:|
@@ -246,5 +251,38 @@ In fact, we expect that many of the fields not used actually display
 similarities with fields that *are* indexed. For example, the `<dbo:abstract>`
 field will probably match because the title is repeated in the abstract.
 
+We can perform the same analysis on the human assessments. This gives a rather
+different list of fields (listed here in ascending order of score):
+
+| Field                         | Score    | Rank for BM25 | Used by Nordlys |
+|-------------------------------|---------:|--------------:|:---------------:|
+| `<dbp:pushpinMapCaption>`     |   133.77 |            28 | ![][n]          |
+| `<dbp:foundation>`            |   136.32 |           266 | ![][n]          |
+| `<dbp:imageCaption>`          |   139.85 |            13 | ![][n]          |
+| `<dbp:bridgeName>`            |   164.91 |            49 | ![][n]          |
+| `<dbp:imageFlag>`             |   166.35 |            30 | ![][n]          |
+| `<dbp:mapCaption>`            |   170.93 |            11 | ![][n]          |
+| `<dbo:foundingYear>`          |   173.92 |           299 | ![][n]          |
+| `<dbp:producer>`              |   186.37 |            12 | ![][n]          |
+| `<dbp:ground>`                |   297.25 |           802 | ![][n]          |
+| `<dbp:title>`                 |   328.93 |            10 | ![][n]          |
+| `<dc:description>`            |   332.05 |             8 | ![][n]          |
+| `<dbp:shortDescription>`      |   334.79 |             9 | ![][n]          |
+| `<dbp:caption>`               |   648.73 |             7 | ![][n]          |
+| `<foaf:givenName>`            |  1436.74 |            21 | ![][y]          |
+| `<dbp:name>`                  |  1961.98 |             6 | ![][y]          |
+| `<foaf:name>`                 |  2086.67 |             5 | ![][y]          |
+| `<dbo:wikiPageWikiLinkText>`  |  2897.51 |             4 | ![][y]          |
+| `<rdfs:label>`                |  3483.06 |             3 | ![][y]          |
+| `<rdfs:comment>`              | 12323.46 |             2 | ![][n]          |
+| `<dbo:abstract>`              | 13002.74 |             1 | ![][n]          |
+
+Based on this, one may want to try adding fields like `<dbp:caption>` to the
+index.
+
+Conversely, this information can also be used to improve the relevance measure.
+Apparently, `<dbp:ground>`, `<dbo:foundingYear>` and `<dbp:foundation>` are
+quite relevant according to human assessors, but rank very low for BM25.
+
 [y]: http://i.stack.imgur.com/iro5J.png
 [n]: http://i.stack.imgur.com/asAya.png
diff --git a/run.py b/run.py
index 7b42ea8..1551717 100755
--- a/run.py
+++ b/run.py
@@ -22,7 +22,10 @@ def match(value, terms):
     return False
 
 def run(queries, line):
-    query, _, dbpediaid, _, relevance, method = line.split('\t')
+    try:
+        query, _, dbpediaid, _, relevance, method = line.split('\t')
+    except ValueError: # For qrels.txt
+        query, _, dbpediaid, relevance = line.split('\t')
     terms = queries[query].split()
     try:
         result = get(dbpediaid)
@@ -33,8 +36,8 @@ def run(queries, line):
             for value in values:
                 if match(value, terms):
                     matches += 1
-            print('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
-                query, dbpediaid, relevance, field, len(values), matches))
+            print('{}\t{}\t{}\t{}\t{}\t{}'.format(
+                query, dbpediaid, float(relevance), field, len(values), matches))
     except Exception as e:
         print(dbpediaid)
         print(e)
-- 
cgit v1.2.3