Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 47 additions & 28 deletions command/agent/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,13 @@ type HTTPServer struct {
// NewHTTPServers starts an HTTP server for every address.http configured in
// the agent.
func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
var srvs []*HTTPServer
var serverInitializationErrors error
var (
srvs []*HTTPServer
serverInitializationErrors error

connCount int = 0
countMux *sync.Mutex = &sync.Mutex{}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a lock to the ConnState callback could potentially introduce some contention if there are many incoming requests and a short telemetry.collectionInterval. This may be worth running some benchmarks.

)

// Get connection handshake timeout limit
handshakeTimeout, err := time.ParseDuration(config.Limits.HTTPSHandshakeTimeout)
Expand Down Expand Up @@ -185,7 +190,7 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
httpServer := http.Server{
Addr: srv.Addr,
Handler: handlers.CompressHandler(srv.mux),
ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, srv.logger),
ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, &connCount, countMux, srv.logger),
ErrorLog: newHTTPServerLogger(srv.logger),
}

Expand All @@ -197,6 +202,19 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
srvs = append(srvs, srv)
}

go func() {
ticker := time.NewTicker(config.Telemetry.collectionInterval)
defer ticker.Stop()
for {
<-ticker.C

// lock connCount to avoid torn reads, as this is updated by ConnState callbacks
countMux.Lock()
metrics.SetGauge([]string{"nomad", "agent", "http", "connections"}, float32(connCount))
countMux.Unlock()
}
}()

// Return early on errors
if serverInitializationErrors != nil {
for _, srv := range srvs {
Expand Down Expand Up @@ -250,44 +268,36 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
//
// If limit > 0, a per-address connection limit will be enabled regardless of
// TLS. If connLimit == 0 there is no connection limit.
func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) {
func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, connCount *int, connMux *sync.Mutex, logger log.Logger) func(conn net.Conn, state http.ConnState) {
connLimiter := connLimiter(connLimit, logger)
if !isTLS || handshakeTimeout == 0 {
if connLimit > 0 {
// Still return the connection limiter
return connLimiter
}
return nil
}

if connLimit > 0 {
// Return conn state callback with connection limiting and a
// handshake timeout.

return func(conn net.Conn, state http.ConnState) {
connMux.Lock()

switch state {
case http.StateNew:
// Set deadline to prevent slow send before TLS handshake or first
// byte of request.
conn.SetDeadline(time.Now().Add(handshakeTimeout))
case http.StateActive:
// Clear read deadline. We should maybe set read timeouts more
// generally but that's a bigger task as some HTTP endpoints may
// stream large requests and responses (e.g. snapshot) so we can't
// set sensible blanket timeouts here.
conn.SetDeadline(time.Time{})
*connCount++
case http.StateClosed:
*connCount--
}

// Call connection limiter
connLimiter(conn, state)
connMux.Unlock()

// Call connection limiter if enabled
if connLimit > 0 {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're already introducing some closures here with the connCount and mutex, adding the connLimit as one more closure allows us to condense the logic a bit.

connLimiter(conn, state)
}
}
}

// Return conn state callback with just a handshake timeout
// (connection limiting disabled).
// Return conn state callback with connection limiting and a
// handshake timeout.
return func(conn net.Conn, state http.ConnState) {
connMux.Lock()

switch state {
case http.StateNew:
*connCount++
// Set deadline to prevent slow send before TLS handshake or first
// byte of request.
conn.SetDeadline(time.Now().Add(handshakeTimeout))
Expand All @@ -297,6 +307,15 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, lo
// stream large requests and responses (e.g. snapshot) so we can't
// set sensible blanket timeouts here.
conn.SetDeadline(time.Time{})
case http.StateClosed:
*connCount--
}

connMux.Unlock()

// Call connection limiter if enabled
if connLimit > 0 {
connLimiter(conn, state)
}
}
}
Expand Down
18 changes: 10 additions & 8 deletions website/content/docs/reference/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -533,16 +533,18 @@ Agent metrics are emitted by all Nomad agents running in either client or server

| Metric | Description | Unit | Type | Labels |
|-----------------------------------------------|--------------------------------------------------------|---------|---------|--------|
| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter | - |
| `nomad.client.consul.check_deregistrations` | Number of Consul check deregistration operations | Integer | Counter | host |
| `nomad.client.consul.check_registrations` | Number of Consul check registration operations | Integer | Counter | host |
| `nomad.client.consul.checks` | Number of Consul checks currently registered | Integer | Gauge | host |
| `nomad.client.consul.service_deregistrations` | Number of Consul service deregistration operations | Integer | Counter | host |
| `nomad.client.consul.service_registrations` | Number of Consul service registration operations | Integer | Counter | host |
| `nomad.client.consul.services` | Number of Consul services currently registered | Integer | Gauge | host |
| `nomad.client.consul.sync_failure` | Number of failed attempts to sync services with Consul | Integer | Counter | host |
| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter | - |
| `nomad.agent.http.connections` | Count of active HTTP connections across all [bound addresses] | Integer | Gauge | - |
| `nomad.client.consul.check_deregistrations` | Number of Consul check deregistration operations | Integer | Counter | host |
| `nomad.client.consul.check_registrations` | Number of Consul check registration operations | Integer | Counter | host |
| `nomad.client.consul.checks` | Number of Consul checks currently registered | Integer | Gauge | host |
| `nomad.client.consul.service_deregistrations` | Number of Consul service deregistration operations | Integer | Counter | host |
| `nomad.client.consul.service_registrations` | Number of Consul service registration operations | Integer | Counter | host |
| `nomad.client.consul.services` | Number of Consul services currently registered | Integer | Gauge | host |
| `nomad.client.consul.sync_failure` | Number of failed attempts to sync services with Consul | Integer | Counter | host |

[tagged-metrics]: /nomad/docs/reference/metrics#tagged-metrics
[sticky]: /nomad/docs/job-specification/ephemeral_disk#sticky
[s_port_plan_failure]: https://developer.hashicorp.com/nomad/s/port-plan-failure
[`disable_allocation_hook_metrics`]: /nomad/docs/configuration/telemetry#disable_allocation_hook_metrics
[bound addresses]: /nomad/docs/configuration#bind_addr