Skip to content

Commit b3e1c67

Browse files
committed
Retry reaching the server if there are transient errors
Signed-off-by: Chetan Banavikalmutt <[email protected]>
1 parent c129232 commit b3e1c67

File tree

2 files changed

+32
-11
lines changed

2 files changed

+32
-11
lines changed

pkg/cache/cluster.go

+24-10
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ type WeightedSemaphore interface {
147147
Release(n int64)
148148
}
149149

150-
type ListRetryFunc func(err error) bool
150+
type RetryFunc func(err error) bool
151151

152152
// NewClusterCache creates new instance of cluster cache
153153
func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCache {
@@ -176,9 +176,10 @@ func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCa
176176
log: log,
177177
listRetryLimit: 1,
178178
listRetryUseBackoff: false,
179-
listRetryFunc: ListRetryFuncNever,
179+
listRetryFunc: RetryFuncNever,
180180
connectionStatus: ConnectionStatusUnknown,
181181
watchFails: newWatchFailures(),
182+
clusterStatusRetryFunc: RetryFuncNever,
182183
clusterConnectionInterval: defaultClusterConnectionInterval,
183184
}
184185
for i := range opts {
@@ -208,6 +209,8 @@ type clusterCache struct {
208209
// watchFails is used to keep track of the failures while watching resources.
209210
watchFails *watchFailures
210211

212+
clusterStatusRetryFunc RetryFunc
213+
211214
apisMeta map[schema.GroupKind]*apiMeta
212215
serverVersion string
213216
apiResources []kube.APIResourceInfo
@@ -228,7 +231,7 @@ type clusterCache struct {
228231
// retry options for list operations
229232
listRetryLimit int32
230233
listRetryUseBackoff bool
231-
listRetryFunc ListRetryFunc
234+
listRetryFunc RetryFunc
232235

233236
// lock is a rw lock which protects the fields of clusterInfo
234237
lock sync.RWMutex
@@ -264,13 +267,13 @@ type clusterCacheSync struct {
264267
resyncTimeout time.Duration
265268
}
266269

267-
// ListRetryFuncNever never retries on errors
268-
func ListRetryFuncNever(err error) bool {
270+
// RetryFuncNever never retries on errors
271+
func RetryFuncNever(err error) bool {
269272
return false
270273
}
271274

272-
// ListRetryFuncAlways always retries on errors
273-
func ListRetryFuncAlways(err error) bool {
275+
// RetryFuncAlways always retries on errors
276+
func RetryFuncAlways(err error) bool {
274277
return true
275278
}
276279

@@ -1248,6 +1251,10 @@ func (c *clusterCache) StartClusterConnectionStatusMonitoring(ctx context.Contex
12481251
}
12491252

12501253
func (c *clusterCache) clusterConnectionService(ctx context.Context) {
1254+
if c.clusterConnectionInterval <= 0 {
1255+
return
1256+
}
1257+
12511258
ticker := time.NewTicker(c.clusterConnectionInterval)
12521259
defer ticker.Stop()
12531260

@@ -1268,16 +1275,23 @@ func (c *clusterCache) clusterConnectionService(ctx context.Context) {
12681275
}
12691276

12701277
if watchErrors > 0 || watchesRecovered {
1271-
c.log.V(1).Info("verifying cluster connection", "watches", watchErrors)
1272-
1273-
_, err := c.kubectl.GetServerVersion(c.config)
1278+
c.log.V(1).Info("verifying cluster connection", "server", c.config.Host)
1279+
// Retry fetching the server version to avoid invalidating the cache due to transient errors.
1280+
err := retry.OnError(retry.DefaultBackoff, c.clusterStatusRetryFunc, func() error {
1281+
_, err := c.kubectl.GetServerVersion(c.config)
1282+
if err != nil && c.clusterStatusRetryFunc(err) {
1283+
c.log.V(1).Info("Error while fetching server version", "error", err.Error())
1284+
}
1285+
return err
1286+
})
12741287
if err != nil {
12751288
c.updateConnectionStatus(ConnectionStatusFailed)
12761289
} else {
12771290
c.updateConnectionStatus(ConnectionStatusSuccessful)
12781291
}
12791292
}
12801293
case <-ctx.Done():
1294+
c.log.V(1).Info("Stopping cluster connection status monitoring", "server", c.config.Host)
12811295
ticker.Stop()
12821296
return
12831297
}

pkg/cache/settings.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func SetTracer(tracer tracing.Tracer) UpdateSettingsFunc {
147147
}
148148

149149
// SetRetryOptions sets cluster list retry options
150-
func SetRetryOptions(maxRetries int32, useBackoff bool, retryFunc ListRetryFunc) UpdateSettingsFunc {
150+
func SetRetryOptions(maxRetries int32, useBackoff bool, retryFunc RetryFunc) UpdateSettingsFunc {
151151
return func(cache *clusterCache) {
152152
// Max retries must be at least one
153153
if maxRetries < 1 {
@@ -177,3 +177,10 @@ func SetClusterConnectionInterval(interval time.Duration) UpdateSettingsFunc {
177177
cache.clusterConnectionInterval = interval
178178
}
179179
}
180+
181+
// SetClusterStatusRetryFunc sets the retry function for monitoring the cluster connection status.
182+
func SetClusterStatusRetryFunc(retryFunc RetryFunc) UpdateSettingsFunc {
183+
return func(cache *clusterCache) {
184+
cache.clusterStatusRetryFunc = retryFunc
185+
}
186+
}

0 commit comments

Comments
 (0)