path: root/run.py
blob: 7f0fea91b7ce4d13ee6d5f85c78f786305dcab5d
#!/usr/bin/env python3
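# For each (query, DBpedia entity) pair read from the input run file, look up the
# entity's fields via the Nordlys entity-catalog API and write, per field, how many
# of its values contain at least one query term.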

import fileinput
import json
import os
from urllib.parse import quote_plus
from urllib.request import urlopen, Request

DATADIR = 'data'
ERRORFILE = 'errors'
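# DATADIR is expected to exist already; API responses are cached there,
# and failed lookups are appended to ERRORFILE.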

def get(dbpediaid):
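    """Return the Nordlys entity-catalog record for dbpediaid, caching the raw JSON in DATADIR."""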
    outfile = os.path.join(DATADIR, quote_plus(dbpediaid) + '.json')
    if not os.path.isfile(outfile):
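        # keep '/' unescaped in the lookup URL; everything else is quote_plus-encoded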
        url = 'http://api.nordlys.cc/ec/lookup_id/{}'.format(
                quote_plus(dbpediaid.replace('/', '___SLASH')).replace('___SLASH', '/'))
        print(url)
        result = urlopen(Request(url,
            headers={'User-Agent': 'Radboud University'})).read()
        with open(outfile, 'w') as f:
            f.write(result.decode(encoding='UTF-8'))
    with open(outfile) as f:
        return json.load(f)

def match(value, terms):
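    """Return True if any whitespace-separated token of value occurs in terms."""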
    for v in value.split():
        if v in terms:
            return True
    return False

def run(queries, line, outfile):
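    """Parse one tab-separated run line and write, for every field of the entity,
    the number of field values that contain at least one query term."""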
    query, _, dbpediaid, _, relevance, method = line.rstrip('\n').split('\t')
    terms = queries[query].split()
    try:
        result = get(dbpediaid)
        for field, values in result.items():
            matches = 0
            for value in values:
                if match(value, terms):
                    matches += 1
            outfile.write('{}\t{}\t{}\t{}\t{}\n'.format(
                query, dbpediaid, field, len(values), matches))
    except Exception as e:
        print(dbpediaid)
        print(e)
        with open(ERRORFILE, 'a') as f:
            f.write(dbpediaid + '\t' + str(e) + '\n')

if __name__ == '__main__':
    with open('queries_stopped.json') as f:
        queries = json.load(f)

    with open('run.txt', 'w') as out:
        for line in fileinput.input():
            run(queries, line, out)