Draft (changes from 2 commits)
6 changes: 6 additions & 0 deletions client_test.go
@@ -38,6 +38,12 @@ type mockConn struct {
ReceiveOverride map[string]func(ctx context.Context, subscribe Completed, fn func(message PubSubMessage)) error
}

func (m *mockConn) SetServerUnHealthy() {}

func (m *mockConn) IsServerUnHealthy() bool {
return true
}

func (m *mockConn) Override(c conn) {
if m.OverrideFn != nil {
m.OverrideFn(c)
26 changes: 25 additions & 1 deletion cluster.go
@@ -206,15 +206,28 @@ func (c *clusterClient) _refresh() (err error) {
pending = nil

groups := result.parse(c.opt.TLSConfig != nil)

conns := make(map[string]connrole, len(groups))
for master, g := range groups {
conns[master] = connrole{conn: c.connFn(master, c.opt)}
if c.rOpt != nil {
for _, nodeInfo := range g.nodes[1:] {
// do not include unhealthy connections in this refresh cycle
if cc, ok := c.conns[nodeInfo.Addr]; ok {
if cc.conn.IsServerUnHealthy() {
continue
}
}
conns[nodeInfo.Addr] = connrole{conn: c.connFn(nodeInfo.Addr, c.rOpt)}
}
} else {
for _, nodeInfo := range g.nodes[1:] {
// do not include unhealthy connections in this refresh cycle
if cc, ok := c.conns[nodeInfo.Addr]; ok {
if cc.conn.IsServerUnHealthy() {
continue
}
}
@rueian (Collaborator) commented on Jun 22, 2025:
Hi @srikar-jilugu, thanks for the PR. I think the idea is good. A few thoughts:

  1. Skipping the conns[nodeInfo.Addr] assignments is not the correct place to exclude unhealthy connections in this refresh cycle. The actual places to skip those connections are the assignments of pslots[i] and rslots[i].
  2. Furthermore, we can't skip unhealthy connections if they are connected to the masters. What we can only do is avoid picking unhealthy connections for replicas. That is, the only two places we can skip those connections are:
    pslots[i] = conns[g.nodes[1+util.FastRand(nodesCount-1)].Addr].conn

    rslots[i] = conns[g.nodes[1+rIndex].Addr].conn
  3. I think there should not be a SetServerUnHealthy method. It should be done privately in the conn.
  4. I think there should not be an UnHealthyNodeInterval option, and 15s is too long. I guess we need a periodic active check.
  5. IsLoading could be better than IsServerUnHealthy.
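
The two suggested skip points can be sketched as a small fallback helper. Everything below is illustrative: `pickReplica`, `fakeConn`, and the simplified `node`/`conn` shapes are assumptions, not valkey-go's actual types; only the idea from the review — skip loading replicas at slot assignment and fall back to the master — is preserved.

```go
package main

import "fmt"

// Minimal stand-ins for the internal types; shapes are simplified assumptions.
type conn interface{ IsLoading() bool }

type fakeConn struct{ loading bool }

func (f fakeConn) IsLoading() bool { return f.loading }

type node struct{ Addr string }

// pickReplica returns the index into nodes of a replica whose conn is not
// loading, falling back to the master (index 0) when none qualifies. This
// mirrors the suggestion: skip unhealthy conns only when assigning
// pslots/rslots, never when wiring up masters.
func pickReplica(nodes []node, conns map[string]conn, candidate int) int {
	if c, ok := conns[nodes[1+candidate].Addr]; ok && !c.IsLoading() {
		return 1 + candidate
	}
	// fall back: scan the remaining replicas, then settle on the master
	for i := 1; i < len(nodes); i++ {
		if c, ok := conns[nodes[i].Addr]; ok && !c.IsLoading() {
			return i
		}
	}
	return 0 // master
}

func main() {
	nodes := []node{{"master:6379"}, {"replica1:6379"}, {"replica2:6379"}}
	conns := map[string]conn{
		"master:6379":   fakeConn{},
		"replica1:6379": fakeConn{loading: true},
		"replica2:6379": fakeConn{},
	}
	fmt.Println(pickReplica(nodes, conns, 0)) // replica1 is loading, so prints 2
}
```

When every replica is loading, the helper degrades to the master rather than leaving a slot unassigned, which matches the fallback behavior discussed below.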

@srikar-jilugu (PR author) replied:

1, 2: I believe this change only checks the health status of the replica nodes, since we initiate the connrole for the master separately here:

conns[master] = connrole{conn: c.connFn(master, c.opt)}

Also, in the case where there is only one replica and it is unhealthy, we fall back to the master while assigning connections to pslots and rslots. So this shouldn't cause a situation where we run out of conn assignments or ignore a master node during assignment.

3: Any suggestions on how we can detect that a node is unhealthy? I think this method would allow us to extend it to other scenarios in the future (e.g. marking a node unhealthy when multiple retries fail).
4: Agreed, this need not be exposed.
5: As mentioned in point 3, I think we can use this status in other cases in the future, not just loading. WDYT?

@rueian (Collaborator) replied on Jun 23, 2025:

So this shouldn't cause a situation where we are out of conn assignments or ignore a master node for assignment.

Yes, but we still have a final mutation of the conn map at L236-L245. Please do the unhealthy check after that:

valkey-go/cluster.go

Lines 236 to 245 in ed9f107

c.mu.RLock()
for addr, cc := range c.conns {
if fresh, ok := conns[addr]; ok {
fresh.conn = cc.conn
conns[addr] = fresh
} else {
removes = append(removes, cc.conn)
}
}
c.mu.RUnlock()

Any suggestion on how we can detect that the node is unhealthy, I think, this method would allow us to extend it to other scenarios.

There are two ways to check if a node is loading: check the loading_eta_seconds in an INFO response, or check each response if a loading error occurs. I don't have a good idea for detecting other unhealthy scenarios. I guess whether a node is unhealthy or not even varies from user to user. Different users may want different behaviors. Could we just use IsLoading for now to avoid overloading the term Unhealthy until we come up with an API that allows users to mark a node as unhealthy and specify what to do with it?
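
The first detection route mentioned here — checking `loading_eta_seconds` in an INFO reply — could look roughly like the sketch below. `parseLoadingETA` is a hypothetical helper, not valkey-go code, though `loading` and `loading_eta_seconds` are real fields in the INFO Persistence section.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseLoadingETA extracts the loading flag and loading_eta_seconds from an
// INFO reply. INFO lines are CRLF-separated "field:value" pairs.
func parseLoadingETA(info string) (loading bool, eta int64) {
	for _, line := range strings.Split(info, "\r\n") {
		if v, ok := strings.CutPrefix(line, "loading:"); ok {
			loading = v == "1"
		} else if v, ok := strings.CutPrefix(line, "loading_eta_seconds:"); ok {
			eta, _ = strconv.ParseInt(v, 10, 64)
		}
	}
	return loading, eta
}

func main() {
	info := "# Persistence\r\nloading:1\r\nloading_eta_seconds:42\r\n"
	fmt.Println(parseLoadingETA(info)) // prints: true 42
}
```

The other route — reacting to a LOADING error on a command response — needs no parsing at all, which is one reason the PR ended up taking it.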

@srikar-jilugu (PR author) replied:

@rueian updated the load-status handling based on your feedback; roughly:

  1. triggering the loading state from an initial LOADING error response to any command, and setting the timeout for this status equal to the ETA (AFAIK the node might not respond to any requests during this period)
  2. checking the node's loading status and considering only the active nodes just before pslots and rslots are assigned

@srikar-jilugu (PR author) added:

Will test this out on our workloads as well before I mark this PR ready for review.

conns[nodeInfo.Addr] = connrole{conn: c.connFn(nodeInfo.Addr, c.opt)}
}
}
@@ -524,6 +537,11 @@ process:
resp = results.s[1]
resultsp.Put(results)
goto process
case RedirectLoadingRetry:
// mark the associated node temporarily unhealthy
cc.SetServerUnHealthy()
c.refresh(ctx) // on-demand refresh
fallthrough
case RedirectRetry:
if c.retry && cmd.IsReadOnly() {
shouldRetry := c.retryHandler.WaitOrSkipRetry(ctx, attempts, cmd, resp.Error())
@@ -1229,8 +1247,13 @@ func (c *clusterClient) shouldRefreshRetry(err error, ctx context.Context) (addr
mode = RedirectMove
} else if addr, ok = err.IsAsk(); ok {
mode = RedirectAsk
} else if err.IsClusterDown() || err.IsTryAgain() || err.IsLoading() {
} else if err.IsClusterDown() || err.IsTryAgain() {
mode = RedirectRetry
} else if err.IsLoading() {
mode = RedirectLoadingRetry
}
} else if ctx.Err() == nil {
mode = RedirectRetry
@@ -1447,6 +1470,7 @@ const (
RedirectMove
RedirectAsk
RedirectRetry
RedirectLoadingRetry

panicMsgCxSlot = "cross slot command in Dedicated is prohibited"
panicMixCxSlot = "Mixing no-slot and cross slot commands in DoMulti is prohibited"
26 changes: 24 additions & 2 deletions mux.go
@@ -43,6 +43,8 @@ type conn interface {
Addr() string
SetOnCloseHook(func(error))
OptInCmd() cmds.Completed
SetServerUnHealthy()
IsServerUnHealthy() bool
}

var _ conn = (*mux)(nil)
@@ -61,8 +63,10 @@ type mux struct {
maxp int
maxm int

usePool bool
optIn bool
usePool bool
optIn bool
serverUnhealthystatus atomic.Uint32
unhealthyServerTimeout time.Duration
}

func makeMux(dst string, option *ClientOption, dialFn dialFn) *mux {
@@ -107,6 +111,7 @@ func newMux(dst string, option *ClientOption, init, dead wire, wireFn wireFn, wi

m.dpool = newPool(option.BlockingPoolSize, dead, option.BlockingPoolCleanup, option.BlockingPoolMinSize, wireFn)
m.spool = newPool(option.BlockingPoolSize, dead, option.BlockingPoolCleanup, option.BlockingPoolMinSize, wireNoBgFn)
m.unhealthyServerTimeout = option.UnHealthyNodeInterval
return m
}

@@ -411,6 +416,23 @@ func (m *mux) Addr() string {
return m.dst
}

func (m *mux) SetServerUnHealthy() {
m.serverUnhealthystatus.Store(uint32(time.Now().Unix()))
}

func (m *mux) IsServerUnHealthy() bool {
	unhealthy := m.serverUnhealthystatus.Load()
	if unhealthy == 0 {
		return false
	}
	// the stored value is a unix-seconds timestamp while
	// unhealthyServerTimeout is a time.Duration (nanoseconds), so convert
	// the stamp back to a time.Time before comparing elapsed time
	if time.Since(time.Unix(int64(unhealthy), 0)) < m.unhealthyServerTimeout {
		return true
	}
	m.serverUnhealthystatus.Store(0) // reset the status once the timeout has passed
	return false
}

func isBroken(err error, w wire) bool {
return err != nil && err != ErrClosing && w.Error() != nil
}
7 changes: 7 additions & 0 deletions valkey.go
@@ -239,6 +239,10 @@ type ClientOption struct {
// EnableReplicaAZInfo enables the client to load the replica node's availability zone.
// If true, the client will set the `AZ` field in `ReplicaInfo`.
EnableReplicaAZInfo bool

// UnHealthyNodeInterval is the interval for which a node stays marked unhealthy during a temporary degraded state; defaults to 15 seconds.
// Note: only used by the cluster client.
UnHealthyNodeInterval time.Duration
}

// SentinelOption contains MasterSet,
@@ -442,6 +446,9 @@ func NewClient(option ClientOption) (client Client, err error) {
if option.RetryDelay == nil {
option.RetryDelay = defaultRetryDelayFn
}
if option.UnHealthyNodeInterval == 0 {
option.UnHealthyNodeInterval = 15 * time.Second // default to 15 seconds
}
if option.Sentinel.MasterSet != "" {
option.PipelineMultiplex = singleClientMultiplex(option.PipelineMultiplex)
return newSentinelClient(&option, makeConn, newRetryer(option.RetryDelay))