author     Camil Staps   2017-12-01 13:30:17 +0100
committer  Camil Staps   2017-12-01 13:30:17 +0100
commit     54fe6461ab6afa5a506fe11fc9f583e7adb18045 (patch)
tree       7ca0fe3093ca103d4e8cf9775203db98b7f234d5 /scrape.py
parent     plan (diff)
Scraper
Diffstat (limited to 'scrape.py')
-rwxr-xr-x   scrape.py   33
1 file changed, 33 insertions, 0 deletions
diff --git a/scrape.py b/scrape.py
new file mode 100755
index 0000000..067a16c
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+import fileinput
+import os
+from urllib.parse import quote_plus
+from urllib.request import urlopen, Request
+
+OUTDIR = 'data'
+ERRORFILE = 'errors'
+
+def get(dbpediaid):
+    outfile = os.path.join(OUTDIR, quote_plus(dbpediaid) + '.json')
+    if os.path.isfile(outfile):
+        return
+    url = 'http://api.nordlys.cc/ec/lookup_id/{}'.format(
+        quote_plus(dbpediaid.replace('/', '___SLASH')).replace('___SLASH', '/'))
+    print(url)
+    result = urlopen(Request(url,
+        headers={'User-Agent': 'Radboud University'})).read()
+    with open(outfile, 'w') as f:
+        f.write(result.decode(encoding='UTF-8'))
+
+def scrape(line):
+    index, query, dbpediaid, relevance = line.split('\t')
+    try:
+        get(dbpediaid)
+    except Exception as e:
+        with open(ERRORFILE, 'a') as f:
+            f.write(dbpediaid + '\t' + str(e) + '\n')
+
+if __name__ == '__main__':
+    for line in fileinput.input():
+        scrape(line)
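A minimal usage sketch, not part of the commit: the script reads qrels-style lines with four tab-separated fields (index, query, DBpedia ID, relevance) from stdin or from files named on the command line via fileinput, and writes one JSON response per entity into OUTDIR. Note that get() does not create the data/ directory, so it must exist before the first write, and the ___SLASH round-trip appears to be there to keep forward slashes in the entity ID unescaped in the URL path while quoting everything else. The entity ID and query below are hypothetical examples.

    # Hypothetical driver for scrape.py; the input line is made up for illustration.
    import os
    from scrape import scrape, OUTDIR

    os.makedirs(OUTDIR, exist_ok=True)   # get() assumes this directory already exists
    scrape('1\texample query\t<dbpedia:Example_Entity>\t1\n')

Failures are appended to the errors file as "<id>\t<message>" lines, so a partial run can be inspected and re-fed to the script; already-downloaded IDs are skipped because get() returns early when the output file exists.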