
Commit e8a4654

razvan and maltesander authored

chore: replace jmx exporter with built in Prometheus support (#584)

* remove references to the jmx exporter agent
* add test for connect Prometheus metrics endpoint
* use different error variant for metric serialization
* applications export Prometheus metrics
* update docs
* Update changelog
* Update CHANGELOG.md

Co-authored-by: Malte Sander <[email protected]>

1 parent 5b9e415 commit e8a4654

10 files changed (+87, -37 lines)

CHANGELOG.md

Lines changed: 2 additions & 0 deletions

@@ -33,6 +33,7 @@ All notable changes to this project will be documented in this file.
 - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
 - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
 - This is marked as breaking because tools and policies might exist, which require these fields to be set
+- BREAKING: the JMX exporter has been replaced with the built-in Prometheus servlet. The history server pods do not expose metrics anymore ([#584])

 ### Fixed

@@ -59,6 +60,7 @@ All notable changes to this project will be documented in this file.
 [#574]: https://github.com/stackabletech/spark-k8s-operator/pull/574
 [#580]: https://github.com/stackabletech/spark-k8s-operator/pull/580
 [#575]: https://github.com/stackabletech/spark-k8s-operator/pull/575
+[#584]: https://github.com/stackabletech/spark-k8s-operator/pull/584

 ## [25.3.0] - 2025-03-21

docs/modules/spark-k8s/pages/usage-guide/history-server.adoc

Lines changed: 8 additions & 0 deletions

@@ -156,3 +156,11 @@ spark-history-node-cleaner NodePort 10.96.203.43 <none> 18080:325
 By setting up port forwarding on 18080, the UI can be opened by pointing your browser to `http://localhost:18080`:

 image::history-server-ui.png[History Server Console]
+
+== Metrics
+
+[NOTE]
+====
+Up to version 25.3 of the Stackable Data Platform, the history server used the JMX exporter to expose metrics on a separate port.
+Starting with version 25.7, the JMX exporter has been removed and, as of Spark version 3.5.6, the history server does not expose metrics.
+====

docs/modules/spark-k8s/pages/usage-guide/operations/applications.adoc

Lines changed: 15 additions & 0 deletions

@@ -8,3 +8,18 @@ As the operator creates the necessary resources, the status of the application t

 NOTE: The operator never reconciles an application once it has been created.
 To resubmit an application, a new SparkApplication resource must be created.
+
+== Metrics
+
+[NOTE]
+====
+Up to version 25.3 of the Stackable Data Platform, Spark applications used the JMX exporter to expose metrics on a separate port.
+Starting with version 25.7, the built-in Prometheus servlet is used instead.
+====
+
+Application driver pods expose Prometheus metrics at the following endpoints:
+
+* `/metrics/prometheus` for driver instances
+* `/metrics/executors/prometheus` for executor instances
+
+These endpoints are available on the same port as the Spark UI, which is 4040 by default.
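For a quick smoke test of these endpoints from outside the cluster, here is a minimal sketch, assuming the driver UI port has been forwarded to localhost (for example with `kubectl port-forward <driver-pod> 4040:4040`) and the `reqwest` crate with its `blocking` feature is available; pod name and port are illustrative:

```rust
// Fetch the driver's Prometheus metrics over a forwarded port and print the samples.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let body = reqwest::blocking::get("http://localhost:4040/metrics/prometheus")?.text()?;

    // The Prometheus exposition format is line-oriented; skip `#` comment lines.
    for sample in body.lines().filter(|line| !line.starts_with('#')) {
        println!("{sample}");
    }
    Ok(())
}
```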

rust/operator-binary/src/connect/common.rs

Lines changed: 6 additions & 1 deletion

@@ -34,6 +34,11 @@ pub enum Error {
     JvmSecurityProperties {
         source: product_config::writer::PropertiesWriterError,
     },
+
+    #[snafu(display("failed to serialize metrics properties"))]
+    MetricsProperties {
+        source: product_config::writer::PropertiesWriterError,
+    },
 }

 pub(crate) fn labels<'a, T>(

@@ -149,5 +154,5 @@ pub(crate) fn metrics_properties(
         );
     }

-    to_java_properties_string(result.iter()).context(JvmSecurityPropertiesSnafu)
+    to_java_properties_string(result.iter()).context(MetricsPropertiesSnafu)
 }
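The change above swaps a copy-pasted context selector for the new variant, so serialization failures of the metrics properties no longer report themselves as JVM security property errors. A self-contained sketch of the snafu pattern involved, using `std::io::Error` in place of the `product_config` writer error:

```rust
use snafu::{ResultExt, Snafu};

// Each fallible step gets its own variant; the derive generates a
// `MetricsPropertiesSnafu` context selector for use with `.context(...)`.
#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("failed to serialize metrics properties"))]
    MetricsProperties { source: std::io::Error },
}

fn serialize_metrics() -> Result<String, Error> {
    // `.context(...)` wraps the io::Error as Error::MetricsProperties,
    // so the failure message names the file actually being serialized.
    std::fs::read_to_string("metrics.properties").context(MetricsPropertiesSnafu)
}
```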

rust/operator-binary/src/crd/constants.rs

Lines changed: 0 additions & 1 deletion

@@ -86,7 +86,6 @@ pub const SPARK_DEFAULTS_FILE_NAME: &str = "spark-defaults.conf";
 pub const SPARK_ENV_SH_FILE_NAME: &str = "spark-env.sh";

 pub const SPARK_CLUSTER_ROLE: &str = "spark-k8s-clusterrole";
-pub const METRICS_PORT: u16 = 18081;
 pub const HISTORY_UI_PORT: u16 = 18080;

 pub const LISTENER_VOLUME_NAME: &str = "listener";

rust/operator-binary/src/crd/mod.rs

Lines changed: 8 additions & 0 deletions

@@ -649,6 +649,14 @@ impl v1alpha1::SparkApplication {
         )]);
     }

+    // Enable Prometheus metrics export
+    submit_cmd.extend(vec![
+        "--conf spark.metrics.conf.\\*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet".to_string(),
+        "--conf spark.metrics.conf.\\*.sink.prometheusServlet.path=/metrics/prometheus".to_string(),
+        "--conf spark.ui.prometheus.enabled=true".to_string(),
+        "--conf spark.sql.streaming.metricsEnabled=true".to_string(),
+    ]);
+
     // some command elements need to be initially stored in a map (to allow overwrites) and
     // then added to the vector once complete.
     let mut submit_conf: BTreeMap<String, String> = BTreeMap::new();
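The trailing comment hints at why the Prometheus flags can be pushed directly onto `submit_cmd` while other settings go through a map first: map entries can still be overwritten before rendering. A minimal sketch of that pattern under those assumptions (the helper name is illustrative, not the operator's API):

```rust
use std::collections::BTreeMap;

// Defaults are inserted first; later inserts (e.g. user overrides) replace
// them, and only the final map is rendered into `--conf key=value` arguments.
fn render_conf(defaults: &[(&str, &str)], overrides: &[(&str, &str)]) -> Vec<String> {
    let mut conf: BTreeMap<String, String> = BTreeMap::new();
    for &(k, v) in defaults.iter().chain(overrides.iter()) {
        conf.insert(k.to_string(), v.to_string()); // later inserts win
    }
    conf.into_iter()
        .map(|(k, v)| format!("--conf {k}={v}"))
        .collect()
}

fn main() {
    let args = render_conf(
        &[("spark.ui.prometheus.enabled", "true")],
        &[("spark.ui.prometheus.enabled", "false")], // override wins
    );
    println!("{args:?}"); // ["--conf spark.ui.prometheus.enabled=false"]
}
```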

rust/operator-binary/src/history/config/jvm.rs

Lines changed: 3 additions & 9 deletions

@@ -5,9 +5,8 @@ use stackable_operator::role_utils::{

 use crate::crd::{
     constants::{
-        JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, METRICS_PORT,
-        STACKABLE_TLS_STORE_PASSWORD, STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG,
-        VOLUME_MOUNT_PATH_LOG_CONFIG,
+        JVM_SECURITY_PROPERTIES_FILE, LOG4J2_CONFIG_FILE, STACKABLE_TLS_STORE_PASSWORD,
+        STACKABLE_TRUST_STORE, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG_CONFIG,
     },
     history::HistoryConfigFragment,
     logdir::ResolvedLogDir,

@@ -33,9 +32,6 @@ pub fn construct_history_jvm_args(
         format!(
             "-Djava.security.properties={VOLUME_MOUNT_PATH_CONFIG}/{JVM_SECURITY_PROPERTIES_FILE}"
         ),
-        format!(
-            "-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={METRICS_PORT}:/stackable/jmx/config.yaml"
-        ),
     ];

     if logdir.tls_enabled() {

@@ -86,8 +82,7 @@ mod tests {
         assert_eq!(
             jvm_config,
             "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
-            -Djava.security.properties=/stackable/spark/conf/security.properties \
-            -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml"
+            -Djava.security.properties=/stackable/spark/conf/security.properties"
         );
     }

@@ -130,7 +125,6 @@ mod tests {
             jvm_config,
             "-Dlog4j.configurationFile=/stackable/log_config/log4j2.properties \
             -Djava.security.properties=/stackable/spark/conf/security.properties \
-            -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=18081:/stackable/jmx/config.yaml \
             -Dhttps.proxyHost=proxy.my.corp \
             -Djava.net.preferIPv4Stack=true \
             -Dhttps.proxyPort=1234"

rust/operator-binary/src/history/history_controller.rs

Lines changed: 5 additions & 6 deletions

@@ -56,11 +56,11 @@ use crate::{
         constants::{
             ACCESS_KEY_ID, HISTORY_APP_NAME, HISTORY_CONTROLLER_NAME, HISTORY_ROLE_NAME,
             HISTORY_UI_PORT, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR,
-            LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, METRICS_PORT, OPERATOR_NAME,
-            SECRET_ACCESS_KEY, SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME,
-            SPARK_IMAGE_BASE_NAME, STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG,
-            VOLUME_MOUNT_NAME_LOG, VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG,
-            VOLUME_MOUNT_PATH_LOG, VOLUME_MOUNT_PATH_LOG_CONFIG,
+            LISTENER_VOLUME_NAME, MAX_SPARK_LOG_FILES_SIZE, OPERATOR_NAME, SECRET_ACCESS_KEY,
+            SPARK_DEFAULTS_FILE_NAME, SPARK_ENV_SH_FILE_NAME, SPARK_IMAGE_BASE_NAME,
+            STACKABLE_TRUST_STORE, VOLUME_MOUNT_NAME_CONFIG, VOLUME_MOUNT_NAME_LOG,
+            VOLUME_MOUNT_NAME_LOG_CONFIG, VOLUME_MOUNT_PATH_CONFIG, VOLUME_MOUNT_PATH_LOG,
+            VOLUME_MOUNT_PATH_LOG_CONFIG,
         },
         history::{self, HistoryConfig, SparkHistoryServerContainer, v1alpha1},
         listener_ext,

@@ -574,7 +574,6 @@ fn build_stateful_set(
         ])
         .args(command_args(log_dir))
         .add_container_port("http", HISTORY_UI_PORT.into())
-        .add_container_port("metrics", METRICS_PORT.into())
         .add_env_vars(merged_env)
         .add_volume_mounts(log_dir.volume_mounts())
         .context(AddVolumeMountSnafu)?

rust/operator-binary/src/spark_k8s_controller.rs

Lines changed: 26 additions & 20 deletions

@@ -37,6 +37,7 @@ use stackable_operator::{
         core::{DeserializeGuard, error_boundary},
         runtime::{controller::Action, reflector::ObjectRef},
     },
+    kvp::Label,
     logging::controller::ReconcilerError,
     product_config_utils::ValidatedRoleConfigByPropertyKind,
     product_logging::{

@@ -610,27 +611,32 @@ fn pod_template(
         );
     }

+    let mut omb = ObjectMetaBuilder::new();
+    omb.name(&container_name)
+        // this reference is not pointing to a controller but only provides a UID that can be used to clean up resources
+        // cleanly (specifically driver pods and related config maps) when the spark application is deleted.
+        .ownerreference_from_resource(spark_application, None, None)
+        .context(ObjectMissingMetadataForOwnerRefSnafu)?
+        .with_recommended_labels(
+            spark_application
+                .build_recommended_labels(&spark_image.app_version_label, &container_name),
+        )
+        .context(MetadataBuildSnafu)?;
+
+    // Only the driver pod should be scraped by Prometheus
+    // because the executor metrics are also available via /metrics/executors/prometheus/
+    if role == SparkApplicationRole::Driver {
+        omb.with_label(Label::try_from(("prometheus.io/scrape", "true")).context(LabelBuildSnafu)?);
+    }
+
     let mut pb = PodBuilder::new();
-    pb.metadata(
-        ObjectMetaBuilder::new()
-            .name(&container_name)
-            // this reference is not pointing to a controller but only provides a UID that can used to clean up resources
-            // cleanly (specifically driver pods and related config maps) when the spark application is deleted.
-            .ownerreference_from_resource(spark_application, None, None)
-            .context(ObjectMissingMetadataForOwnerRefSnafu)?
-            .with_recommended_labels(
-                spark_application
-                    .build_recommended_labels(&spark_image.app_version_label, &container_name),
-            )
-            .context(MetadataBuildSnafu)?
-            .build(),
-    )
-    .add_container(cb.build())
-    .add_volumes(volumes.to_vec())
-    .context(AddVolumeSnafu)?
-    .security_context(security_context())
-    .image_pull_secrets_from_product_image(spark_image)
-    .affinity(&config.affinity);
+    pb.metadata(omb.build())
+        .add_container(cb.build())
+        .add_volumes(volumes.to_vec())
+        .context(AddVolumeSnafu)?
+        .security_context(security_context())
+        .image_pull_secrets_from_product_image(spark_image)
+        .affinity(&config.affinity);

     let init_containers = init_containers(
         spark_application,
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+timeout: 300
+commands:
+  - script: |
+      # This endpoint (/metrics/prometheus) is also used as liveness probe
+      echo test Prometheus endpoint for driver metrics
+      DRIVER_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/prometheus | grep _driver_ | wc -l)
+      test 0 -lt "$DRIVER_METRIC_COUNT"
+
+      echo test Prometheus endpoint for executor metrics
+      EXECUTOR_METRIC_COUNT=$(kubectl exec spark-connect-server-0 -c spark -n $NAMESPACE -- curl localhost:4040/metrics/executors/prometheus | grep _executor_ | wc -l)
+      test 0 -lt "$EXECUTOR_METRIC_COUNT"
