-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsparql.py
89 lines (72 loc) · 2.97 KB
/
sparql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/python3.8
# -*- coding: utf-8 -*-
# License: GNU GPL v2+
import json
import re
import time
import urllib.parse
import backoff
import requests
# URL prefixes that are stripped from SPARQL result values.
LINGUALIBRE_ENTITY = "https://lingualibre.org/entity/"

# These two must stay plain "http": that is the scheme the SPARQL
# responses actually contain, so "https" would never match.
WIKIDATA_ENTITY = "http://www.wikidata.org/entity/"
COMMONS_FILEPATH = "http://commons.wikimedia.org/wiki/Special:FilePath/"

# Endpoint of the SPARQL query service.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
def _retry_after_seconds(header_value):
    """Return a ``Retry-After`` delay as a non-negative int, or None.

    None is returned when the header is absent or is not a plain number of
    seconds (for instance an HTTP-date), so the caller can skip sleeping
    instead of crashing.
    """
    if header_value is None:
        return None
    try:
        seconds = int(header_value)
    except (TypeError, ValueError):
        return None
    return seconds if seconds >= 0 else None


def _rest_of_line(text: str, start: int) -> str:
    """Return *text* from index *start* up to the next newline, stripped."""
    end = text.find("\n", start)
    if end == -1:
        # The message sits on the last line without a trailing newline:
        # slicing with -1 would silently drop its final character.
        end = len(text)
    return text[start:end].strip()


# TODO better handle the exceptions coming from this
@backoff.on_exception(backoff.expo,
                      exception=(requests.exceptions.Timeout,
                                 requests.exceptions.ConnectionError,
                                 requests.exceptions.ChunkedEncodingError,
                                 json.decoder.JSONDecodeError),
                      max_tries=5)
def request(endpoint: str, query: str):
    """POST a SPARQL query and return the result bindings.

    The call is retried with exponential backoff (up to 5 tries) on
    timeouts, connection errors, chunked-encoding errors and undecodable
    JSON.

    Args:
        endpoint: URL of the SPARQL endpoint to query.
        query: The SPARQL query text.

    Returns:
        The ``results.bindings`` value of the JSON response on success, or
        an empty string when a known error was detected (HTTP 504, 429 or
        403, or a ``MalformedQueryException`` / ``TimeoutException``
        reported in the response body). A message describing the error is
        printed in those cases.
    """
    # NOTE(review): no timeout is passed to requests.post here.
    response = requests.post(endpoint, data={"format": "json", "query": query})
    status = response.status_code

    if status == 504:
        print("504 Gateway Time-out\n"
              "Try to use --startdate")
        return ""

    if status == 429:
        print("Error 429 Too Many Requests")
        return ""

    if status == 403:
        title = re.search(r'<\W*title\W*(.*)</title', response.text, re.IGNORECASE)
        # Fall back to the HTTP reason phrase when the body has no <title>.
        reason = title[1] if title else response.reason
        retry_after = _retry_after_seconds(response.headers.get("Retry-After"))
        if retry_after is None:
            # Header missing or not a number of seconds: nothing to wait for.
            print(f"Error 403; {reason}")
        else:
            print(f"Error 403; {reason}\nWait for {retry_after} seconds")
            time.sleep(retry_after)
        return ""

    body = response.text

    marker = "MalformedQueryException"
    found_at = body.find(marker)
    if found_at != -1:
        # The description follows the exception name and one separator char.
        detail = _rest_of_line(body, found_at + len(marker) + 1)
        print(f"MalformedQueryException: {detail}")
        return ""

    marker = "TimeoutException"
    found_at = body.find(marker)
    if found_at != -1:
        # Prefer the fully qualified Java name; if the body only has the
        # bare name, start there rather than slicing from index -1.
        qualified_at = body.find(f"java.util.concurrent.{marker}")
        start = qualified_at if qualified_at != -1 else found_at
        detail = _rest_of_line(body, start)
        print(f"TimeoutException: {detail}")
        return ""

    return json.loads(body)["results"]["bindings"]
def format_value(sparql_result, key):
    """Extract a plain value for *key* from one SPARQL result row.

    Returns None when *key* is absent from the row or when its binding is a
    blank node (unknown value). Non-URI bindings are returned unchanged.
    For URI bindings the Lingua Libre / Wikidata entity prefixes are
    stripped, and a Commons file-path prefix is stripped and the remainder
    URL-unquoted.
    """
    if key not in sparql_result:
        return None

    binding = sparql_result[key]
    kind = binding["type"]

    # A blank node stands for an unknown value.
    if kind == "bnode":
        return None

    value = binding["value"]
    if kind != "uri":
        return value

    # Entity URIs are reduced to what follows the known prefix.
    for prefix in (LINGUALIBRE_ENTITY, WIKIDATA_ENTITY):
        if value.startswith(prefix):
            value = value[len(prefix):]

    # Commons file paths additionally have their percent-escapes decoded.
    if value.startswith(COMMONS_FILEPATH):
        remainder = value[len(COMMONS_FILEPATH):]
        value = urllib.parse.unquote(remainder)

    return value