Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tmpnet] Set an explicit instance label for logs and metrics #3650

Merged
merged 2 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,23 +510,6 @@ func (n *Network) StartNode(ctx context.Context, log logging.Logger, node *Node)

// Restart a single node.
func (n *Network) RestartNode(ctx context.Context, log logging.Logger, node *Node) error {
// Ensure the node reuses the same API port across restarts to ensure
// consistent labeling of metrics. Otherwise prometheus's automatic
// addition of the `instance` label (host:port) results in
// segmentation of results for a given node every time the port
// changes on restart. This segmentation causes graphs on the grafana
// dashboards to display multiple series per graph for a given node,
// one for each port that the node used.
//
// There is a non-zero chance of the port being allocatted to a
// different process and the node subsequently being unable to start,
// but the alternative is having to update the grafana dashboards
// query-by-query to ensure that node metrics ignore the instance
// label.
if err := node.SaveAPIPort(); err != nil {
return err
}

if err := node.Stop(ctx); err != nil {
return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
}
Expand Down
22 changes: 6 additions & 16 deletions tests/fixture/tmpnet/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"os"
Expand Down Expand Up @@ -366,19 +365,10 @@ func (n *Node) EnsureNodeID() error {
return nil
}

// Saves the currently allocated API port to the node's configuration
// for use across restarts. Reusing the port ensures consistent
// labeling of metrics.
func (n *Node) SaveAPIPort() error {
hostPort := strings.TrimPrefix(n.URI, "http://")
if len(hostPort) == 0 {
// Without an API URI there is nothing to save
return nil
}
_, port, err := net.SplitHostPort(hostPort)
if err != nil {
return err
}
n.Flags[config.HTTPPortKey] = port
return nil
// GetUniqueID returns a globally unique identifier for the node.
func (n *Node) GetUniqueID() string {
nodeIDString := n.NodeID.String()
startIndex := len(ids.NodeIDPrefix)
endIndex := startIndex + 8 // 8 characters should be enough to identify a node in the context of its network
return n.NetworkUUID + "-" + strings.ToLower(nodeIDString[startIndex:endIndex])
}
4 changes: 4 additions & 0 deletions tests/fixture/tmpnet/node_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ func (p *NodeProcess) getProcess() (*os.Process, error) {
func (p *NodeProcess) writeMonitoringConfig() error {
// Ensure labeling that uniquely identifies the node and its network
commonLabels := FlagsMap{
// Explicitly setting an instance label avoids the default
// behavior of using the node's URI since the URI isn't
// guaranteed stable (e.g. port may change after restart).
"instance": p.node.GetUniqueID(),
"network_uuid": p.node.NetworkUUID,
"node_id": p.node.NodeID,
"is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral),
Expand Down
Loading