Skip to content

Commit cea6dcc

Browse files
committed
Retry reaching the server if there are transient errors
Signed-off-by: Chetan Banavikalmutt <[email protected]>
1 parent eccc58d commit cea6dcc

File tree

2 files changed

+32
-11
lines changed

2 files changed

+32
-11
lines changed

pkg/cache/cluster.go

+24-10
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ type WeightedSemaphore interface {
147147
Release(n int64)
148148
}
149149

150-
type ListRetryFunc func(err error) bool
150+
type RetryFunc func(err error) bool
151151

152152
// NewClusterCache creates new instance of cluster cache
153153
func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCache {
@@ -176,9 +176,10 @@ func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCa
176176
log: log,
177177
listRetryLimit: 1,
178178
listRetryUseBackoff: false,
179-
listRetryFunc: ListRetryFuncNever,
179+
listRetryFunc: RetryFuncNever,
180180
connectionStatus: ConnectionStatusUnknown,
181181
watchFails: newWatchFailures(),
182+
clusterStatusRetryFunc: RetryFuncNever,
182183
clusterConnectionInterval: defaultClusterConnectionInterval,
183184
}
184185
for i := range opts {
@@ -208,6 +209,8 @@ type clusterCache struct {
208209
// watchFails is used to keep track of the failures while watching resources.
209210
watchFails *watchFailures
210211

212+
clusterStatusRetryFunc RetryFunc
213+
211214
apisMeta map[schema.GroupKind]*apiMeta
212215
serverVersion string
213216
apiResources []kube.APIResourceInfo
@@ -228,7 +231,7 @@ type clusterCache struct {
228231
// retry options for list operations
229232
listRetryLimit int32
230233
listRetryUseBackoff bool
231-
listRetryFunc ListRetryFunc
234+
listRetryFunc RetryFunc
232235

233236
// lock is a rw lock which protects the fields of clusterInfo
234237
lock sync.RWMutex
@@ -264,13 +267,13 @@ type clusterCacheSync struct {
264267
resyncTimeout time.Duration
265268
}
266269

267-
// ListRetryFuncNever never retries on errors
268-
func ListRetryFuncNever(err error) bool {
270+
// RetryFuncNever never retries on errors
271+
func RetryFuncNever(err error) bool {
269272
return false
270273
}
271274

272-
// ListRetryFuncAlways always retries on errors
273-
func ListRetryFuncAlways(err error) bool {
275+
// RetryFuncAlways always retries on errors
276+
func RetryFuncAlways(err error) bool {
274277
return true
275278
}
276279

@@ -1247,6 +1250,10 @@ func (c *clusterCache) StartClusterConnectionStatusMonitoring(ctx context.Contex
12471250
}
12481251

12491252
func (c *clusterCache) clusterConnectionService(ctx context.Context) {
1253+
if c.clusterConnectionInterval <= 0 {
1254+
return
1255+
}
1256+
12501257
ticker := time.NewTicker(c.clusterConnectionInterval)
12511258
defer ticker.Stop()
12521259

@@ -1267,16 +1274,23 @@ func (c *clusterCache) clusterConnectionService(ctx context.Context) {
12671274
}
12681275

12691276
if watchErrors > 0 || watchesRecovered {
1270-
c.log.V(1).Info("verifying cluster connection", "watches", watchErrors)
1271-
1272-
_, err := c.kubectl.GetServerVersion(c.config)
1277+
c.log.V(1).Info("verifying cluster connection", "server", c.config.Host)
1278+
// Retry fetching the server version to avoid invalidating the cache due to transient errors.
1279+
err := retry.OnError(retry.DefaultBackoff, c.clusterStatusRetryFunc, func() error {
1280+
_, err := c.kubectl.GetServerVersion(c.config)
1281+
if err != nil && c.clusterStatusRetryFunc(err) {
1282+
c.log.V(1).Info("Error while fetching server version", "error", err.Error())
1283+
}
1284+
return err
1285+
})
12731286
if err != nil {
12741287
c.updateConnectionStatus(ConnectionStatusFailed)
12751288
} else {
12761289
c.updateConnectionStatus(ConnectionStatusSuccessful)
12771290
}
12781291
}
12791292
case <-ctx.Done():
1293+
c.log.V(1).Info("Stopping cluster connection status monitoring", "server", c.config.Host)
12801294
ticker.Stop()
12811295
return
12821296
}

pkg/cache/settings.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func SetTracer(tracer tracing.Tracer) UpdateSettingsFunc {
147147
}
148148

149149
// SetRetryOptions sets cluster list retry options
150-
func SetRetryOptions(maxRetries int32, useBackoff bool, retryFunc ListRetryFunc) UpdateSettingsFunc {
150+
func SetRetryOptions(maxRetries int32, useBackoff bool, retryFunc RetryFunc) UpdateSettingsFunc {
151151
return func(cache *clusterCache) {
152152
// Max retries must be at least one
153153
if maxRetries < 1 {
@@ -177,3 +177,10 @@ func SetClusterConnectionInterval(interval time.Duration) UpdateSettingsFunc {
177177
cache.clusterConnectionInterval = interval
178178
}
179179
}
180+
181+
// SetClusterStatusRetryFunc sets the retry function for monitoring the cluster connection status.
182+
func SetClusterStatusRetryFunc(retryFunc RetryFunc) UpdateSettingsFunc {
183+
return func(cache *clusterCache) {
184+
cache.clusterStatusRetryFunc = retryFunc
185+
}
186+
}

0 commit comments

Comments
 (0)