Commit 7fc7c74

UPSTREAM: <carry>: test: add retry to getMetricsFromNode
1 parent 672cb75 commit 7fc7c74

2 files changed: 16 additions & 16 deletions

test/e2e/framework/metrics/kubelet_metrics.go
Lines changed: 0 additions & 1 deletion

@@ -33,7 +33,6 @@ import (
 )
 
 const (
-	proxyTimeout = 2 * time.Minute
 	// dockerOperationsLatencyKey is the key for the operation latency metrics.
 	// Taken from k8s.io/kubernetes/pkg/kubelet/dockershim/metrics
 	dockerOperationsLatencyKey = "docker_operations_duration_seconds"

test/e2e/framework/metrics/metrics_grabber.go
Lines changed: 16 additions & 15 deletions

@@ -29,6 +29,7 @@ import (
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
+	k8snet "k8s.io/apimachinery/pkg/util/net"
 	"k8s.io/apimachinery/pkg/util/wait"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
@@ -94,7 +95,6 @@ type Grabber struct {
 // support it. If disabled for a component, the corresponding Grab function
 // will immediately return an error derived from MetricsGrabbingDisabledError.
 func NewMetricsGrabber(ctx context.Context, c clientset.Interface, ec clientset.Interface, config *rest.Config, kubelets bool, scheduler bool, controllers bool, apiServer bool, clusterAutoscaler bool, snapshotController bool) (*Grabber, error) {
-
 	kubeScheduler := ""
 	kubeControllerManager := ""
 	snapshotControllerManager := ""
@@ -213,28 +213,29 @@ func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string,
 }
 
 func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (string, error) {
-	// There's a problem with timing out during proxy. Wrapping this in a goroutine to prevent deadlock.
-	finished := make(chan struct{}, 1)
+	// There's a problem with timing out during proxy. We are going to set a 45 second client timeout, and issue a retry.
 	var err error
-	var rawOutput []byte
-	go func() {
-		rawOutput, err = g.client.CoreV1().RESTClient().Get().
+	var output []byte
+	err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 2*time.Minute, true, func(ctx context.Context) (done bool, retErr error) {
+		rawOutput, err := g.client.CoreV1().RESTClient().Get().
 			Resource("nodes").
 			SubResource("proxy").
 			Name(fmt.Sprintf("%v:%v", nodeName, kubeletPort)).
 			Suffix(pathSuffix).
+			Timeout(45 * time.Second).
 			Do(ctx).Raw()
-		finished <- struct{}{}
-	}()
-	select {
-	case <-time.After(proxyTimeout):
-		return "", fmt.Errorf("Timed out when waiting for proxy to gather metrics from %v", nodeName)
-	case <-finished:
 		if err != nil {
-			return "", err
+			if k8snet.IsTimeout(err) {
+				klog.Warningf("Metrics rest call timed out")
+				return false, nil
+			}
+			klog.Warningf("Metrics rest call errored: %v", err)
+			return false, nil
 		}
-		return string(rawOutput), nil
-	}
+		output = rawOutput
+		return true, nil
+	})
+	return string(output), err
 }
 
 // GrabFromKubeProxy returns metrics from kube-proxy
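
The pattern this diff adopts — a per-attempt client timeout plus a poll loop that treats timeouts and other request errors as retriable — can be illustrated outside the e2e framework. Below is a minimal, self-contained sketch in which plain net/http stands in for the nodes/proxy REST call; fetchWithRetry, the example URL, and the log messages are illustrative only, while the 15-second interval, 45-second per-attempt timeout, and 2-minute overall deadline mirror the values in the diff.

// A sketch of the retry pattern, assuming a plain HTTP endpoint instead of the
// nodes/proxy subresource used by the real Grabber.
package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"time"

	k8snet "k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/wait"
)

// fetchWithRetry is a hypothetical helper: it bounds each attempt with a
// 45-second client timeout and retries every 15 seconds for up to 2 minutes,
// treating timeouts and other transient request errors as retriable.
func fetchWithRetry(ctx context.Context, url string) (string, error) {
	client := &http.Client{Timeout: 45 * time.Second} // per-attempt timeout

	var output []byte
	err := wait.PollUntilContextTimeout(ctx, 15*time.Second, 2*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
			if err != nil {
				return false, err // malformed request: give up immediately
			}
			resp, err := client.Do(req)
			if err != nil {
				if k8snet.IsTimeout(err) {
					fmt.Println("request timed out, retrying")
					return false, nil
				}
				fmt.Printf("request errored: %v, retrying\n", err)
				return false, nil
			}
			defer resp.Body.Close()
			body, err := io.ReadAll(resp.Body)
			if err != nil {
				fmt.Printf("reading body errored: %v, retrying\n", err)
				return false, nil
			}
			output = body
			return true, nil // success ends the poll
		})
	return string(output), err
}

func main() {
	// The URL is illustrative; any metrics-style endpoint would do.
	out, err := fetchWithRetry(context.Background(), "http://127.0.0.1:10255/metrics")
	if err != nil {
		fmt.Println("giving up:", err)
		return
	}
	fmt.Println(out)
}

Returning false with a nil error from the poll condition keeps the loop going, so the only hard failure modes are the overall 2-minute deadline expiring or the surrounding context being cancelled — the same behavior the commit gives getMetricsFromNode.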
