about summary refs log tree commit diff
path: root/scrape.py
diff options
context:
space:
mode:
author Camil Staps 2017-12-01 13:30:17 +0100
committer Camil Staps 2017-12-01 13:30:17 +0100
commit 54fe6461ab6afa5a506fe11fc9f583e7adb18045 (patch)
tree 7ca0fe3093ca103d4e8cf9775203db98b7f234d5 /scrape.py
parent plan (diff)
Scraper
Diffstat (limited to 'scrape.py')
-rwxr-xr-x scrape.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/scrape.py b/scrape.py
new file mode 100755
index 0000000..067a16c
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+import fileinput
+import os
+from urllib.parse import quote_plus
+from urllib.request import urlopen, Request
+
# Directory in which get() stores one JSON file per looked-up identifier.
OUTDIR = 'data'
# File to which scrape() appends one line for every identifier that failed.
ERRORFILE = 'errors'
+
def get(dbpediaid):
    """Download the lookup result for *dbpediaid* into OUTDIR.

    The response body is stored as ``OUTDIR/<quote_plus(dbpediaid)>.json``.
    When that file already exists nothing is fetched, so an interrupted run
    can be resumed simply by starting the script again.

    Args:
        dbpediaid: Identifier to look up; it is percent-encoded (keeping
            any '/' literal) and appended to the lookup URL.

    Raises:
        urllib.error.URLError: If the request fails (includes HTTPError
            for non-success status codes).
        UnicodeDecodeError: If the response body is not valid UTF-8.
        OSError: If the output file cannot be written.
    """
    outfile = os.path.join(OUTDIR, quote_plus(dbpediaid) + '.json')
    if os.path.isfile(outfile):
        return

    # Percent-encode the identifier but keep '/' as a path separator.
    url = 'http://api.nordlys.cc/ec/lookup_id/{}'.format(
        quote_plus(dbpediaid, safe='/'))
    print(url)

    request = Request(url, headers={'User-Agent': 'Radboud University'})
    # The context manager closes the connection even when read() fails.
    with urlopen(request) as response:
        body = response.read().decode(encoding='UTF-8')

    os.makedirs(OUTDIR, exist_ok=True)
    # Write to a temporary name and rename afterwards: the existence check
    # above must never mistake a half-written file for a finished download.
    partial = outfile + '.part'
    with open(partial, 'w', encoding='UTF-8') as f:
        f.write(body)
    os.replace(partial, outfile)
+
def scrape(line):
    """Fetch the entity named in one tab-separated input record.

    *line* is expected to hold exactly four tab-separated fields: index,
    query, DBpedia id and relevance. Only the DBpedia id is used; it is
    passed to get(). Blank lines are ignored.

    Failures never propagate: a record with the wrong number of fields, or
    any exception raised by get(), is appended to ERRORFILE as a
    tab-separated line so the caller can continue with the next record.

    Args:
        line: One raw input line, with or without its trailing newline.
    """
    record = line.rstrip('\r\n')
    if not record.strip():
        return

    try:
        _index, _query, dbpediaid, _relevance = record.split('\t')
    except ValueError:
        # Wrong number of fields: log the offending record and move on
        # instead of aborting the whole run.
        with open(ERRORFILE, 'a') as f:
            f.write(record + '\tmalformed input line\n')
        return

    try:
        get(dbpediaid)
    except Exception as e:
        # Per-record boundary: log the failure and let the run continue.
        # The exception must be converted explicitly; str + Exception
        # raises TypeError.
        with open(ERRORFILE, 'a') as f:
            f.write(dbpediaid + '\t' + str(e) + '\n')
+
if __name__ == '__main__':
    # Records come from the files named on the command line, or from
    # standard input when no file is given.
    with fileinput.input() as records:
        for record in records:
            scrape(record)