PIP-1490: Document Prometheus metrics.

mailgun · Jan 18, 2022 · a1c0512 · a1c0512
1 parent cc5d708
commit a1c0512
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -330,3 +330,6 @@ See the `example.conf` for all available config options and their descriptions.
 See [architecture.md](/architecture.md) for a full description of the architecture and the inner 
 workings of gubernator.
 
+## Monitoring
+Gubernator publishes Prometheus metrics for realtime monitoring.  See
+[prometheus.md](prometheus.md) for details.
diff --git a/grpc_stats.go b/grpc_stats.go
@@ -50,12 +50,15 @@ func NewGRPCStatsHandler() *GRPCStatsHandler {
 	c := &GRPCStatsHandler{
 		grpcRequestCount: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Name: "gubernator_grpc_request_counts",
-			Help: "GRPC requests by status.",
+			Help: "The count of gRPC requests.",
 		}, []string{"status", "method"}),
 		grpcRequestDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{
 			Name:       "gubernator_grpc_request_duration",
-			Help:       "GRPC request durations in seconds",
-			Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001},
+			Help:       "The timings of gRPC requests in seconds",
+			Objectives: map[float64]float64{
+				0.5: 0.05,
+				0.99: 0.001,
+			},
 		}, []string{"method"}),
 	}
 	c.run()

diff --git a/gubernator.go b/gubernator.go
@@ -61,7 +61,7 @@ var getRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
 }, []string{"calltype"})
 var funcTimeMetric = prometheus.NewSummaryVec(prometheus.SummaryOpts{
 	Name: "gubernator_func_duration",
-	Help: "The timings of key functions in Gubernator.",
+	Help: "The timings of key functions in Gubernator in seconds.",
 	Objectives: map[float64]float64{
 		0.99: 0.001,
 	},

diff --git a/prometheus.md b/prometheus.md
@@ -0,0 +1,33 @@
+# Prometheus Metrics
+Gubernator can be monitored in real time using [Prometheus](https://prometheus.io/) metrics.
+
+## Enabling Metric Collection
+Metrics are exposed under two possible deployment scenarios:
+
+1. Gubernator deployed as a standalone daemon.
+   * Metrics endpoint published at the HTTP `/metrics` URI.
+2. Gubernator embedded as a Go module.
+   * The dependent codebase is responsible for publishing the HTTP `/metrics` URI.
+   * See `daemon.go` for examples using the `promhttp` module.
+
+Finally, configure a Prometheus job to scrape the server's `/metrics` URI.
+
+## Metrics
+
+| Metric                                 | Type    | Description |
+| -------------------------------------- | ------- | ----------- |
+| `gubernator_async_durations`           | Summary | The timings of GLOBAL async sends in seconds. |
+| `gubernator_asyncrequest_retries`      | Counter | The count of retries that occurred in asyncRequests() while forwarding a request to another peer. |
+| `gubernator_broadcast_durations`       | Summary | The timings of GLOBAL broadcasts to peers in seconds. |
+| `gubernator_cache_access_count`        | Counter | The count of LRUCache accesses during rate checks. |
+| `gubernator_cache_size`                | Gauge   | The number of items in LRU Cache which holds the rate limits. |
+| `gubernator_check_counter`             | Counter | The number of rate limits checked. |
+| `gubernator_check_error_counter`       | Counter | The number of errors while checking rate limits. |
+| `gubernator_concurrent_checks_counter` | Summary | 99th quantile of concurrent rate checks.  This includes rate checks processed locally and forwarded to other peers. |
+| `gubernator_func_duration`             | Summary | The 99th quantile of key function timings in seconds. |
+| `gubernator_getratelimit_counter`      | Counter | The count of getRateLimit() calls.  Label "calltype" may be "local" for calls handled by the same peer, "forward" for calls forwarded to another peer, or "global" for global rate limits. |
+| `gubernator_grpc_request_counts`       | Counter | The count of gRPC requests. |
+| `gubernator_grpc_request_duration`     | Summary | The 0.5 and 0.99 quantile timings of gRPC requests in seconds. |
+| `gubernator_over_limit_counter`        | Counter | The number of rate limit checks that are over the limit. |
+| `gubernator_pool_queue_length`         | Summary | The 99th quantile of rate check requests queued up in GubernatorPool.  This is the work queue for local rate checks. |
+| `gubernator_queue_length`              | Summary | The 99th quantile of rate check requests queued up for batching to other peers by getPeerRateLimitsBatch().  This is the work queue for remote rate checks.  Label "peerAddr" indicates queued requests to that peer. |