diff --git a/command/agent/http.go b/command/agent/http.go index a66ea8385b8..3183ec90488 100644 --- a/command/agent/http.go +++ b/command/agent/http.go @@ -115,8 +115,13 @@ type HTTPServer struct { // NewHTTPServers starts an HTTP server for every address.http configured in // the agent. func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { - var srvs []*HTTPServer - var serverInitializationErrors error + var ( + srvs []*HTTPServer + serverInitializationErrors error + + connCount int = 0 + countMux *sync.Mutex = &sync.Mutex{} + ) // Get connection handshake timeout limit handshakeTimeout, err := time.ParseDuration(config.Limits.HTTPSHandshakeTimeout) @@ -185,7 +190,7 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { httpServer := http.Server{ Addr: srv.Addr, Handler: handlers.CompressHandler(srv.mux), - ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, srv.logger), + ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, &connCount, countMux, srv.logger), ErrorLog: newHTTPServerLogger(srv.logger), } @@ -197,6 +202,19 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { srvs = append(srvs, srv) } + go func() { + ticker := time.NewTicker(config.Telemetry.collectionInterval) + defer ticker.Stop() + for { + <-ticker.C + + // lock connCount to avoid torn reads, as this is updated by ConnState callbacks + countMux.Lock() + metrics.SetGauge([]string{"nomad", "agent", "http", "connections"}, float32(connCount)) + countMux.Unlock() + } + }() + // Return early on errors if serverInitializationErrors != nil { for _, srv := range srvs { @@ -250,44 +268,36 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) { // // If limit > 0, a per-address connection limit will be enabled regardless of // TLS. If connLimit == 0 there is no connection limit. -func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) { +func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, connCount *int, connMux *sync.Mutex, logger log.Logger) func(conn net.Conn, state http.ConnState) { connLimiter := connLimiter(connLimit, logger) if !isTLS || handshakeTimeout == 0 { - if connLimit > 0 { - // Still return the connection limiter - return connLimiter - } - return nil - } - - if connLimit > 0 { - // Return conn state callback with connection limiting and a - // handshake timeout. - return func(conn net.Conn, state http.ConnState) { + connMux.Lock() + switch state { case http.StateNew: - // Set deadline to prevent slow send before TLS handshake or first - // byte of request. - conn.SetDeadline(time.Now().Add(handshakeTimeout)) - case http.StateActive: - // Clear read deadline. We should maybe set read timeouts more - // generally but that's a bigger task as some HTTP endpoints may - // stream large requests and responses (e.g. snapshot) so we can't - // set sensible blanket timeouts here. - conn.SetDeadline(time.Time{}) + *connCount++ + case http.StateClosed: + *connCount-- } - // Call connection limiter - connLimiter(conn, state) + connMux.Unlock() + + // Call connection limiter if enabled + if connLimit > 0 { + connLimiter(conn, state) + } } } - // Return conn state callback with just a handshake timeout - // (connection limiting disabled). + // Return conn state callback with connection limiting and a + // handshake timeout. return func(conn net.Conn, state http.ConnState) { + connMux.Lock() + switch state { case http.StateNew: + *connCount++ // Set deadline to prevent slow send before TLS handshake or first // byte of request. conn.SetDeadline(time.Now().Add(handshakeTimeout)) @@ -297,6 +307,15 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, lo // stream large requests and responses (e.g. snapshot) so we can't // set sensible blanket timeouts here. conn.SetDeadline(time.Time{}) + case http.StateClosed: + *connCount-- + } + + connMux.Unlock() + + // Call connection limiter if enabled + if connLimit > 0 { + connLimiter(conn, state) } } } diff --git a/website/content/docs/reference/metrics.mdx b/website/content/docs/reference/metrics.mdx index dd5b0728f59..bf16fd3461b 100644 --- a/website/content/docs/reference/metrics.mdx +++ b/website/content/docs/reference/metrics.mdx @@ -533,16 +533,18 @@ Agent metrics are emitted by all Nomad agents running in either client or server | Metric | Description | Unit | Type | Labels | |-----------------------------------------------|--------------------------------------------------------|---------|---------|--------| -| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter | - | -| `nomad.client.consul.check_deregistrations` | Number of Consul check deregistration operations | Integer | Counter | host | -| `nomad.client.consul.check_registrations` | Number of Consul check registration operations | Integer | Counter | host | -| `nomad.client.consul.checks` | Number of Consul checks currently registered | Integer | Gauge | host | -| `nomad.client.consul.service_deregistrations` | Number of Consul service deregistration operations | Integer | Counter | host | -| `nomad.client.consul.service_registrations` | Number of Consul service registration operations | Integer | Counter | host | -| `nomad.client.consul.services` | Number of Consul services currently registered | Integer | Gauge | host | -| `nomad.client.consul.sync_failure` | Number of failed attempts to sync services with Consul | Integer | Counter | host | +| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter | - | +| `nomad.agent.http.connections` | Count of active HTTP connections across all [bound addresses] | Integer | Gauge | - | +| `nomad.client.consul.check_deregistrations` | Number of Consul check deregistration operations | Integer | Counter | host | +| `nomad.client.consul.check_registrations` | Number of Consul check registration operations | Integer | Counter | host | +| `nomad.client.consul.checks` | Number of Consul checks currently registered | Integer | Gauge | host | +| `nomad.client.consul.service_deregistrations` | Number of Consul service deregistration operations | Integer | Counter | host | +| `nomad.client.consul.service_registrations` | Number of Consul service registration operations | Integer | Counter | host | +| `nomad.client.consul.services` | Number of Consul services currently registered | Integer | Gauge | host | +| `nomad.client.consul.sync_failure` | Number of failed attempts to sync services with Consul | Integer | Counter | host | [tagged-metrics]: /nomad/docs/reference/metrics#tagged-metrics [sticky]: /nomad/docs/job-specification/ephemeral_disk#sticky [s_port_plan_failure]: https://developer.hashicorp.com/nomad/s/port-plan-failure [`disable_allocation_hook_metrics`]: /nomad/docs/configuration/telemetry#disable_allocation_hook_metrics +[bound addresses]: /nomad/docs/configuration#bind_addr