Skip to content

Commit 83578b4

Browse files
redhat-qe-botshakyavVivek Shakya
authored
auto-cherry-pick: [v4.13] add prometheus to ocp utilities (#206)
add prometheus to ocp utilities (#134) create monitoring functionality Co-authored-by: Vivek Shakya <[email protected]> Co-authored-by: Vivek Shakya <[email protected]>
1 parent 94fc544 commit 83578b4

File tree

1 file changed

+219
-0
lines changed

1 file changed

+219
-0
lines changed

ocp_utilities/monitoring.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
import json
2+
import re
3+
from json import JSONDecodeError
4+
5+
import requests
6+
from ocp_resources.route import Route
7+
from ocp_resources.secret import Secret
8+
from ocp_resources.service_account import ServiceAccount
9+
from ocp_resources.utils import TimeoutExpiredError, TimeoutSampler
10+
from simple_logger.logger import get_logger
11+
12+
from ocp_utilities.infra import get_client
13+
14+
15+
# Two minutes, expressed in seconds.
TIMEOUT_2MIN = 120
16+
# Ten minutes, expressed in seconds.
TIMEOUT_10MIN = 600
17+
18+
# Module-level logger, named after this module via __name__.
LOGGER = get_logger(name=__name__)
19+
20+
21+
class Prometheus:
    """
    Client for the Prometheus HTTP API of a cluster.

    Prometheus HTTP API doc:
    https://prometheus.io/docs/prometheus/latest/querying/api/

    On construction the instance resolves the API URL from the Route named
    ``resource_name`` in ``namespace``, builds a bearer-token header from the
    secret of the ServiceAccount with the same name, and reads the scrape
    interval from the targets endpoint.

    The ``query`` method takes only the query expression; the
    ``/api/v1/query?query=`` prefix is added by the method itself, e.g.
        prometheus = Prometheus()
        up = prometheus.query("up")
    The expression is interpolated into the URL verbatim (it is not passed
    through ``params=``).
    """

    # Sampler sleep, in seconds, used when the targets endpoint does not
    # report a usable scrape interval.
    DEFAULT_SCRAPE_INTERVAL = 30

    # Class-level default so instances always have the attribute; ``None``
    # makes ``requests`` wait without a time limit.
    request_timeout = None

    # Milliseconds per Prometheus duration unit (a day is 24h, a week 7d,
    # a year 365d).
    _DURATION_UNIT_MS = {
        "ms": 1,
        "s": 1000,
        "m": 60 * 1000,
        "h": 60 * 60 * 1000,
        "d": 24 * 60 * 60 * 1000,
        "w": 7 * 24 * 60 * 60 * 1000,
        "y": 365 * 24 * 60 * 60 * 1000,
    }

    def __init__(
        self,
        namespace="openshift-monitoring",
        resource_name="prometheus-k8s",
        client=None,
        verify_ssl=True,
        request_timeout=None,
    ):
        """
        Args:
            namespace (str): namespace of the Route and ServiceAccount
            resource_name (str): name of the Route and ServiceAccount
            client: cluster client; obtained from get_client() when not given
            verify_ssl (bool): passed as ``verify`` to every HTTP request
            request_timeout (float or tuple, optional): passed as ``timeout``
                to every HTTP request; None (default) sets no time limit
        """
        self.namespace = namespace
        self.resource_name = resource_name
        self.client = client or get_client()
        self.api_v1 = "/api/v1"
        self.verify_ssl = verify_ssl
        self.request_timeout = request_timeout
        self.api_url = self._get_route()
        self.headers = self._get_headers()
        # Needs api_url and headers, so it must be resolved last.
        self.scrape_interval = self.get_scrape_interval()

    def _get_route(self):
        """Return the https URL built from the host of the Prometheus Route"""
        LOGGER.info("Prometheus: Obtaining route")
        route = Route(
            namespace=self.namespace, name=self.resource_name, client=self.client
        ).instance.spec.host

        return f"https://{route}"

    def _get_headers(self):
        """
        Build the Authorization header for the HTTP API.

        The bearer token is read from the "openshift.io/token-secret.value"
        annotation of the secret returned by _get_resource_secret().
        """
        LOGGER.info("Setting Prometheus headers and Obtaining OAuth token")

        secret = self._get_resource_secret()

        token = secret.instance.metadata.annotations["openshift.io/token-secret.value"]

        return {"Authorization": f"Bearer {token}"}

    def _get_service_account(self):
        """Return the ServiceAccount for the given namespace and resource name"""
        return ServiceAccount(
            namespace=self.namespace, name=self.resource_name, client=self.client
        )

    def _get_resource_secret(self):
        """Return the Secret named by the service account's first imagePullSecrets entry"""
        resource_sa = self._get_service_account()
        return Secret(
            namespace=self.namespace,
            name=resource_sa.instance.imagePullSecrets[0].name,
            client=self.client,
        )

    def _get_response(self, query):
        """
        Send a GET request to the Prometheus API and decode the JSON body.

        Args:
            query (str): path (and query string) appended to the API URL

        Returns:
            dict: decoded JSON response

        Raises:
            JSONDecodeError: if the response body is not valid JSON
        """
        response = requests.get(
            f"{self.api_url}{query}",
            headers=self.headers,
            verify=self.verify_ssl,
            timeout=self.request_timeout,
        )

        # The HTTP status is deliberately not checked here: callers inspect
        # the "status" field of the decoded body instead.
        try:
            return json.loads(response.content)
        except JSONDecodeError as json_exception:
            LOGGER.error(
                "Exception converting query response to JSON: "
                f"exc={json_exception} response_status_code={response.status_code} response={response.content}"
            )
            raise

    def query(self, query):
        """
        Get the prometheus query result

        Args:
            query (str): prometheus query string, e.g. "up"; it is appended
                verbatim after "/api/v1/query?query="

        Returns:
            dict: query result
        """
        return self._get_response(query=f"{self.api_v1}/query?query={query}")

    def get_all_alerts_by_alert_name(self, alert_name):
        """
        Get all active alerts whose "alertname" label equals alert_name

        Args:
            alert_name (str): alert name

        Examples:
            result = prometheus.get_all_alerts_by_alert_name(alert_name='WatchDog')

        Returns:
            list: list containing alert metrics
        """
        return [
            alert
            for alert in self.alerts()["data"]["alerts"]
            if alert["labels"]["alertname"] == alert_name
        ]

    def get_firing_alerts(self, alert_name):
        """
        Get the alerts matching alert_name that are in "firing" state

        Args:
            alert_name (str): alert name

        Returns:
            list: matching alerts whose state is "firing"
        """
        alert_list = self.get_all_alerts_by_alert_name(alert_name=alert_name)
        return [alert for alert in alert_list if alert["state"] == "firing"]

    def wait_for_firing_alert_sampler(self, alert_name, timeout=TIMEOUT_10MIN):
        """
        Sample output for an alert if found in fired state

        Args:
            alert_name (str): alert name
            timeout (int): wait time, default is 10 mins

        Return:
            sample (list): list of all alerts that match the alert name and in firing state

        Raise:
            TimeoutExpiredError: if alert is not fired before timeout
        """
        sampler = TimeoutSampler(
            wait_timeout=timeout,
            sleep=self.scrape_interval,
            func=self.get_firing_alerts,
            alert_name=alert_name,
        )
        try:
            for sample in sampler:
                if sample:
                    LOGGER.info(f"Found alert: {alert_name} in firing state.")
                    return sample

        except TimeoutExpiredError:
            LOGGER.error(f"{alert_name} currently not in firing state")
            raise

    @classmethod
    def _duration_to_seconds(cls, duration):
        """
        Convert a Prometheus duration string to whole seconds.

        Args:
            duration (str): e.g. "30s", "1m", "1m30s", "500ms", or a bare
                number of seconds such as "30"

        Returns:
            int or None: seconds, rounded up and at least 1 for unit-suffixed
                values; None if the value cannot be parsed
        """
        if not isinstance(duration, str):
            return None

        text = duration.strip()
        # A bare number is taken as seconds.
        if re.fullmatch(r"\d+", text):
            return int(text)

        if not re.fullmatch(r"(?:\d+(?:ms|[smhdwy]))+", text):
            return None

        total_ms = sum(
            int(amount) * cls._DURATION_UNIT_MS[unit]
            for amount, unit in re.findall(r"(\d+)(ms|[smhdwy])", text)
        )
        # Ceil to whole seconds; never return 0 so samplers do not busy-loop.
        return max(1, -(-total_ms // 1000))

    def get_scrape_interval(self):
        """
        Get prometheus scrape interval

        Looks up the active target whose "job" label is "prometheus-k8s" and
        converts its "scrapeInterval" duration to seconds.

        Returns:
            int: scrape interval in seconds, or the default of 30 if it is
                not found or cannot be parsed
        """
        response = self._get_response(query=f"{self.api_v1}/targets")
        for target in response["data"]["activeTargets"]:
            if not target or target.get("labels", {}).get("job") != "prometheus-k8s":
                continue
            seconds = self._duration_to_seconds(duration=target.get("scrapeInterval"))
            if seconds is not None:
                return seconds
        return self.DEFAULT_SCRAPE_INTERVAL

    def query_sampler(self, query, timeout=TIMEOUT_2MIN):
        """
        Sample output for query function

        Args:
            query (str): prometheus query string
            timeout (int): wait time, default is 2 mins

        Return:
            list: the query result, once it is non-empty and the response
                status is "success"

        Raise:
            TimeoutExpiredError: if query response doesn't return success
        """
        sampler = TimeoutSampler(
            wait_timeout=timeout,
            sleep=self.scrape_interval,
            func=self.query,
            query=query,
        )
        try:
            for sample in sampler:
                result = sample.get("data", {}).get("result")
                if result and sample["status"] == "success":
                    return result
        except TimeoutExpiredError:
            LOGGER.error(
                f"Failed to get successful status after executing query '{query}'"
            )
            raise

    def alerts(self):
        """
        Get all the active alerts

        Returns:
            dict: decoded response of the alerts endpoint
        """
        return self._get_response(query=f"{self.api_v1}/alerts")

0 commit comments

Comments
 (0)