Skip to content

Commit

Permalink
[tmpnet] Set an explicit instance label for logs and metrics
Browse files Browse the repository at this point in the history
Previously, the configuuration for promtail and prometheus did not set
an explicit `instance` label. This resulted in the default of `ip:port`
being used, and since that could change on restart, it was necessary
to try to ensure a stable port. Setting an explicit and unique value
for `instance` avoids needing to ensure a stable port.
  • Loading branch information
maru-ava committed Jan 16, 2025
1 parent e973af8 commit 671a3fc
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 33 deletions.
17 changes: 0 additions & 17 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,23 +510,6 @@ func (n *Network) StartNode(ctx context.Context, log logging.Logger, node *Node)

// Restart a single node.
func (n *Network) RestartNode(ctx context.Context, log logging.Logger, node *Node) error {
// Ensure the node reuses the same API port across restarts to ensure
// consistent labeling of metrics. Otherwise prometheus's automatic
// addition of the `instance` label (host:port) results in
// segmentation of results for a given node every time the port
// changes on restart. This segmentation causes graphs on the grafana
// dashboards to display multiple series per graph for a given node,
// one for each port that the node used.
//
// There is a non-zero chance of the port being allocatted to a
// different process and the node subsequently being unable to start,
// but the alternative is having to update the grafana dashboards
// query-by-query to ensure that node metrics ignore the instance
// label.
if err := node.SaveAPIPort(); err != nil {
return err
}

if err := node.Stop(ctx); err != nil {
return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
}
Expand Down
22 changes: 6 additions & 16 deletions tests/fixture/tmpnet/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"os"
Expand Down Expand Up @@ -366,19 +365,10 @@ func (n *Node) EnsureNodeID() error {
return nil
}

// Saves the currently allocated API port to the node's configuration
// for use across restarts. Reusing the port ensures consistent
// labeling of metrics.
func (n *Node) SaveAPIPort() error {
hostPort := strings.TrimPrefix(n.URI, "http://")
if len(hostPort) == 0 {
// Without an API URI there is nothing to save
return nil
}
_, port, err := net.SplitHostPort(hostPort)
if err != nil {
return err
}
n.Flags[config.HTTPPortKey] = port
return nil
// GetUniqueID returns a globally unique identifier for the node.
func (n *Node) GetUniqueID() string {
nodeIDString := n.NodeID.String()
startIndex := len(ids.NodeIDPrefix)
endIndex := startIndex + 8 // 8 characters should be enough to identify a node in the context of its network
return n.NetworkUUID + "-" + strings.ToLower(nodeIDString[startIndex:endIndex])
}
4 changes: 4 additions & 0 deletions tests/fixture/tmpnet/node_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ func (p *NodeProcess) getProcess() (*os.Process, error) {
func (p *NodeProcess) writeMonitoringConfig() error {
// Ensure labeling that uniquely identifies the node and its network
commonLabels := FlagsMap{
// Explicitly setting a instance label avoids the use of the
// node's URI which may change on restart and preclude the
// continuity of collected metrics.
"instance": p.node.GetUniqueID(),
"network_uuid": p.node.NetworkUUID,
"node_id": p.node.NodeID,
"is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral),
Expand Down

0 comments on commit 671a3fc

Please sign in to comment.