2 changes: 2 additions & 0 deletions .gitignore
@@ -18,6 +18,8 @@
build
gradle-wrapper.jar
gradle-wrapper.properties
gradlew
gradlew.bat

# Mac
.DS_Store
8 changes: 8 additions & 0 deletions README.md
@@ -20,6 +20,9 @@ It collects all relevant metrics and makes them available to Prometheus via the
- Circuit Breaker
- Indices status
- Cluster settings (selected [disk allocation settings](https://www.elastic.co/guide/en/elasticsearch/reference/master/disk-allocator.html) only)
- Snapshot lifecycle management (SLM) stats:
  - retention runs (count, failures, timeouts)
  - per-policy stats (snapshots taken, failed, deleted, delete failures)

## Compatibility matrix

@@ -80,6 +83,11 @@ To disable exporting cluster settings use:
```
prometheus.cluster.settings: false
```

To disable exporting snapshot lifecycle stats use:
```
prometheus.slm: false
```

These settings can also be [updated dynamically](https://www.elastic.co/guide/en/elasticsearch/reference/master/cluster-update-settings.html).

## Uninstall
1 change: 1 addition & 0 deletions build.gradle
@@ -94,6 +94,7 @@ configurations {

dependencies {
compile "org.elasticsearch:elasticsearch:${versions.elasticsearch}"
compile "org.elasticsearch.client:x-pack-transport:${versions.elasticsearch}"
compile "io.prometheus:simpleclient:${versions.prometheus}"
compile "io.prometheus:simpleclient_common:${versions.prometheus}"
compile "org.apache.logging.log4j:log4j-api:${versions.log4j}"
@@ -20,13 +20,11 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.rest.prometheus.RestPrometheusMetricsAction;

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Locale;

import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Gauge;
import io.prometheus.client.Summary;
@@ -18,6 +18,7 @@
package org.compuscene.metrics.prometheus;

import org.elasticsearch.action.ClusterStatsData;
import org.elasticsearch.action.PrometheusSnapshotLifecycleStats;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.indices.stats.CommonStats;
@@ -37,11 +38,9 @@
import org.elasticsearch.script.ScriptStats;
import org.elasticsearch.threadpool.ThreadPoolStats;
import org.elasticsearch.transport.TransportStats;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import io.prometheus.client.Summary;

/**
@@ -51,13 +50,16 @@ public class PrometheusMetricsCollector {

private boolean isPrometheusClusterSettings;
private boolean isPrometheusIndices;
private boolean isPrometheusSlm;
private PrometheusMetricsCatalog catalog;

public PrometheusMetricsCollector(PrometheusMetricsCatalog catalog,
boolean isPrometheusIndices,
boolean isPrometheusClusterSettings) {
boolean isPrometheusClusterSettings,
boolean isPrometheusSlm) {
this.isPrometheusClusterSettings = isPrometheusClusterSettings;
this.isPrometheusIndices = isPrometheusIndices;
this.isPrometheusSlm = isPrometheusSlm;
this.catalog = catalog;
}

@@ -79,6 +81,7 @@ public void registerMetrics() {
registerOsMetrics();
registerFsMetrics();
registerESSettings();
registerSlmMetrics();
}

private void registerClusterMetrics() {
@@ -916,8 +919,38 @@ private void updateESSettings(ClusterStatsData stats) {
}
}

private void registerSlmMetrics() {
catalog.registerClusterGauge("slm_retention_run_count", "SLM Retention Run Count");
catalog.registerClusterGauge("slm_retention_failed_count", "SLM Retention Failed Count");
catalog.registerClusterGauge("slm_retention_timed_out", "SLM Retention Timed Out");
catalog.registerClusterGauge("slm_snapshot_taken", "SLM Snapshot Taken", "policy_id");
catalog.registerClusterGauge("slm_snapshot_failed", "SLM Snapshot Failed", "policy_id");
catalog.registerClusterGauge("slm_snapshot_deleted", "SLM Snapshot Deleted", "policy_id");
catalog.registerClusterGauge("slm_snapshot_delete_failure", "SLM Snapshot Delete Failure", "policy_id");
}

private void updateSlmMetrics(PrometheusSnapshotLifecycleStats slmStats) {
if (slmStats != null) {
catalog.setClusterGauge("slm_retention_run_count", slmStats.getRetentionRunCount().count());
catalog.setClusterGauge("slm_retention_failed_count", slmStats.getRetentionFailedCount().count());
catalog.setClusterGauge("slm_retention_timed_out", slmStats.getRetentionTimedOut().count());
for (Map.Entry<String, PrometheusSnapshotLifecycleStats.PrometheusSnapshotPolicyStats> entry :
slmStats.getPolicyStats().entrySet()) {
catalog.setClusterGauge("slm_snapshot_taken", entry.getValue().getSnapshotsTaken().count(),
entry.getValue().getPolicyId());
catalog.setClusterGauge("slm_snapshot_failed", entry.getValue().getSnapshotsFailed().count(),
entry.getValue().getPolicyId());
catalog.setClusterGauge("slm_snapshot_deleted", entry.getValue().getSnapshotsDeleted().count(),
entry.getValue().getPolicyId());
catalog.setClusterGauge("slm_snapshot_delete_failure",
entry.getValue().getSnapshotDeleteFailures().count(), entry.getValue().getPolicyId());
}
}
}

public void updateMetrics(ClusterHealthResponse clusterHealthResponse, NodeStats nodeStats,
IndicesStatsResponse indicesStats, ClusterStatsData clusterStatsData) {
IndicesStatsResponse indicesStats, ClusterStatsData clusterStatsData,
PrometheusSnapshotLifecycleStats slmStats) {
Summary.Timer timer = catalog.startSummaryTimer("metrics_generate_time_seconds");

updateClusterMetrics(clusterHealthResponse);
Expand All @@ -939,6 +972,7 @@ public void updateMetrics(ClusterHealthResponse clusterHealthResponse, NodeStats
if (isPrometheusClusterSettings) {
updateESSettings(clusterStatsData);
}
updateSlmMetrics(slmStats);

timer.observeDuration();
}
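Since updateMetrics now takes the SLM stats as a fifth argument and the collector constructor takes a third flag, call sites have to be updated accordingly. A minimal sketch of a hypothetical call site (only getClusterStatsData() and getSlmStats() appear in this diff; the other response accessors and variable names are assumptions):

```
// Hypothetical wiring, not part of this diff: build the collector with the new
// SLM flag and pass the SLM stats carried by the metrics response.
PrometheusMetricsCollector collector = new PrometheusMetricsCollector(
        catalog,
        prometheusSettings.getPrometheusIndices(),
        prometheusSettings.getPrometheusClusterSettings(),
        prometheusSettings.getPrometheusSlm());
collector.registerMetrics();
collector.updateMetrics(
        response.getClusterHealth(),     // assumed accessor
        response.getNodeStats(),         // assumed accessor
        response.getIndicesStats(),      // assumed accessor
        response.getClusterStatsData(),
        response.getSlmStats());
```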
@@ -40,15 +40,21 @@ public class PrometheusSettings {
public static final Setting<Boolean> PROMETHEUS_INDICES =
Setting.boolSetting("prometheus.indices", true,
Setting.Property.Dynamic, Setting.Property.NodeScope);
public static final Setting<Boolean> PROMETHEUS_SLM =
Setting.boolSetting("prometheus.slm", true,
Setting.Property.Dynamic, Setting.Property.NodeScope);

private volatile boolean clusterSettings;
private volatile boolean indices;
private volatile boolean slm;

public PrometheusSettings(Settings settings, ClusterSettings clusterSettings) {
setPrometheusClusterSettings(PROMETHEUS_CLUSTER_SETTINGS.get(settings));
setPrometheusIndices(PROMETHEUS_INDICES.get(settings));
setPrometheusSlm(PROMETHEUS_SLM.get(settings));
clusterSettings.addSettingsUpdateConsumer(PROMETHEUS_CLUSTER_SETTINGS, this::setPrometheusClusterSettings);
clusterSettings.addSettingsUpdateConsumer(PROMETHEUS_INDICES, this::setPrometheusIndices);
clusterSettings.addSettingsUpdateConsumer(PROMETHEUS_SLM, this::setPrometheusSlm);
}

private void setPrometheusClusterSettings(boolean flag) {
@@ -59,11 +65,19 @@ private void setPrometheusIndices(boolean flag) {
this.indices = flag;
}

private void setPrometheusSlm(boolean flag) {
this.slm = flag;
}

public boolean getPrometheusClusterSettings() {
return this.clusterSettings;
}

public boolean getPrometheusIndices() {
return this.indices;
}

public boolean getPrometheusSlm() {
return this.slm;
}
}
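For the new prometheus.slm toggle to be accepted by Elasticsearch it also has to be registered with the plugin, typically by returning it from getSettings(). The plugin class is not part of this excerpt, so the following is only a sketch under the assumption that it looks roughly like this:

```
// Sketch only: the plugin class name and the other listed settings are assumptions,
// as that file is not shown in this diff.
import java.util.Arrays;
import java.util.List;

import org.compuscene.metrics.prometheus.PrometheusSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.plugins.Plugin;

public class PrometheusExporterPlugin extends Plugin {
    @Override
    public List<Setting<?>> getSettings() {
        return Arrays.asList(
                PrometheusSettings.PROMETHEUS_CLUSTER_SETTINGS,
                PrometheusSettings.PROMETHEUS_INDICES,
                PrometheusSettings.PROMETHEUS_SLM);   // new dynamic SLM toggle
    }
}
```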
@@ -17,7 +17,6 @@
package org.elasticsearch.action;

import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.*;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.cluster.metadata.Metadata;
Expand All @@ -28,7 +27,6 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.unit.RatioValue;

import java.io.IOException;


@@ -19,7 +19,6 @@

import org.elasticsearch.action.support.master.MasterNodeReadRequest;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;

/**
@@ -17,46 +17,117 @@

package org.elasticsearch.action;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.compuscene.metrics.prometheus.PrometheusSettings;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
import org.elasticsearch.action.admin.indices.stats.PackageAccessHelper;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.*;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Settings;

import org.elasticsearch.rest.prometheus.RestPrometheusMetricsAction;
import org.elasticsearch.xpack.core.slm.SnapshotLifecycleMetadata;
import org.elasticsearch.xpack.core.slm.SnapshotLifecycleStats;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

/**
* Action response class for Prometheus Exporter plugin.
*/
public class NodePrometheusMetricsResponse extends ActionResponse {
private static final Logger logger = LogManager.getLogger(RestPrometheusMetricsAction.class);
private ClusterHealthResponse clusterHealth;
private NodeStats nodeStats;
@Nullable private IndicesStatsResponse indicesStats;
private ClusterStatsData clusterStatsData = null;
private PrometheusSnapshotLifecycleStats slmStats = null;

public NodePrometheusMetricsResponse(StreamInput in) throws IOException {
super(in);
clusterHealth = new ClusterHealthResponse(in);
nodeStats = new NodeStats(in);
indicesStats = PackageAccessHelper.createIndicesStatsResponse(in);
clusterStatsData = new ClusterStatsData(in);
slmStats = new PrometheusSnapshotLifecycleStats(in);
}

private InputStreamStreamInput getInputStreamFromWriteable(Writeable w) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
OutputStreamStreamOutput osso = new OutputStreamStreamOutput(baos);
ByteArrayInputStream bais = null;
InputStreamStreamInput issi;
try {
w.writeTo(osso);
bais = new ByteArrayInputStream(baos.toByteArray());
issi = new InputStreamStreamInput(bais);
} catch (IOException e) {
throw e;
} finally {
osso.close();
baos.close();
if (bais != null) {
bais.close();
}
}
return issi;
}

private PrometheusSnapshotLifecycleStats getPrometheusSnapshotLifecycleStats(ClusterStateResponse clusterStateResponse) {
Metadata m = clusterStateResponse.getState().getMetadata();
InputStreamStreamInput issiSnapshotLifecycleMetadata = null;
InputStreamStreamInput issiSlmStats = null;
PrometheusSnapshotLifecycleStats prometheusSnapshotLifecycleStats = null;
try {
Metadata.Custom metadataCustom = m.getCustoms().get(SnapshotLifecycleMetadata.TYPE);
if (metadataCustom != null) {
issiSnapshotLifecycleMetadata = getInputStreamFromWriteable(metadataCustom);
SnapshotLifecycleStats slmStats = new SnapshotLifecycleMetadata(issiSnapshotLifecycleMetadata).getStats();
issiSlmStats = getInputStreamFromWriteable(slmStats);
prometheusSnapshotLifecycleStats = new PrometheusSnapshotLifecycleStats(issiSlmStats);
}
} catch (IOException e) {
logger.error("Failed to get SLM stats", e);
} finally {
if (issiSnapshotLifecycleMetadata != null) {
try {
issiSnapshotLifecycleMetadata.close();
} catch (IOException e) {
logger.error("Failed to close issiSnapshotLifecycleMetadata", e);
}
}
if (issiSlmStats != null) {
try {
issiSlmStats.close();
} catch (IOException e) {
logger.error("Failed to close issiSlmStats", e);
}
}
}
return prometheusSnapshotLifecycleStats;
}

public NodePrometheusMetricsResponse(ClusterHealthResponse clusterHealth, NodeStats nodesStats,
@Nullable IndicesStatsResponse indicesStats,
@Nullable ClusterStateResponse clusterStateResponse,
Settings settings,
ClusterSettings clusterSettings) {
ClusterSettings clusterSettings,
PrometheusSettings prometheusSettings) {
this.clusterHealth = clusterHealth;
this.nodeStats = nodesStats;
this.indicesStats = indicesStats;
if (clusterStateResponse != null) {
this.clusterStatsData = new ClusterStatsData(clusterStateResponse, settings, clusterSettings);
if (prometheusSettings.getPrometheusClusterSettings()) {
this.clusterStatsData = new ClusterStatsData(clusterStateResponse, settings, clusterSettings);
}
if (prometheusSettings.getPrometheusSlm()) {
this.slmStats = getPrometheusSnapshotLifecycleStats(clusterStateResponse);
}
}
}

@@ -78,11 +149,17 @@ public ClusterStatsData getClusterStatsData() {
return this.clusterStatsData;
}

@Nullable
public PrometheusSnapshotLifecycleStats getSlmStats() {
return this.slmStats;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
clusterHealth.writeTo(out);
nodeStats.writeTo(out);
out.writeOptionalWriteable(indicesStats);
clusterStatsData.writeTo(out);
slmStats.writeTo(out);
}
}
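The getInputStreamFromWriteable helper above serializes a Writeable into an in-memory buffer and immediately re-reads it, which is how the x-pack SnapshotLifecycleStats gets converted into the plugin's own PrometheusSnapshotLifecycleStats (the two presumably share a wire format). A more compact try-with-resources variant of the same round-trip pattern could look like the sketch below; the helper class and method names are made up and not part of the PR:

```
// Sketch only (not part of the PR): a generic Writeable round-trip helper.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.io.stream.Writeable;

public final class WriteableRoundTrip {
    private WriteableRoundTrip() {}

    // Serialize `source` to bytes and re-read it with `reader`, e.g. a constructor
    // of a different class that understands the same wire format.
    public static <T> T reserialize(Writeable source, Writeable.Reader<T> reader) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (OutputStreamStreamOutput out = new OutputStreamStreamOutput(bytes)) {
            source.writeTo(out);
        }
        try (InputStreamStreamInput in =
                     new InputStreamStreamInput(new ByteArrayInputStream(bytes.toByteArray()))) {
            return reader.read(in);
        }
    }
}
```

Usage would then be roughly `reserialize(slmStats, PrometheusSnapshotLifecycleStats::new)`, relying on the StreamInput constructor introduced by this PR.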