Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion operator/redisfailover/service/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,12 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis
}

rport := getRedisPort(rf.Spec.Redis.Port)
var connectivityErrors []error

// This ensures labels are always updated regardless of Redis connectivity issues
for _, rp := range rps.Items {
if rp.Status.PodIP == master {
r.logger.Infof("Update pod label, namespace: %s, pod name: %s, labels: %v", rf.Namespace, rp.Name, generateRedisMasterRoleLabel())
Copy link

Copilot AI Sep 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These debug log messages could generate excessive noise in production. Consider using Debug level instead of Info for label update operations, or add a condition to only log when labels actually change.

Copilot uses AI. Check for mistakes.
err = r.setMasterLabelIfNecessary(rf.Namespace, rp)
if err != nil {
return err
Expand All @@ -166,6 +170,7 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis
return err
}
} else {
r.logger.Infof("Update pod label, namespace: %s, pod name: %s, labels: %v", rf.Namespace, rp.Name, generateRedisSlaveRoleLabel())
Copy link

Copilot AI Sep 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These debug log messages could generate excessive noise in production. Consider using Debug level instead of Info for label update operations, or add a condition to only log when labels actually change.

Copilot uses AI. Check for mistakes.
err = r.setSlaveLabelIfNecessary(rf.Namespace, rp)
if err != nil {
return err
Expand All @@ -175,16 +180,34 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis
return err
}
}
}

// If any Redis instance is unresponsive, we log the error but don't fail the entire operation
for _, rp := range rps.Items {
slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password)
if err != nil {
r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP)
return err
connectivityErrors = append(connectivityErrors, err)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be exposed as metrics, rf_redis_connectivity_error

continue // Continue checking other pods instead of failing immediately
}
if slave != "" && slave != master {
return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave)
}
}

// If ALL pods had connectivity errors, return an error
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did not get this! Whats happening here?

// This preserves the original behavior when all pods are unresponsive
if len(connectivityErrors) == len(rps.Items) && len(rps.Items) > 0 {
r.logger.Errorf("All Redis instances were unresponsive during replication check")
return connectivityErrors[0] // Return the first error
}

// If we had some connectivity errors but labels were updated successfully,
// log a warning but allow the healing process to continue
if len(connectivityErrors) > 0 {
r.logger.Warningf("Some Redis instances were unresponsive during replication check, but pod labels were updated successfully. Connectivity errors: %d", len(connectivityErrors))
Copy link

Copilot AI Sep 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning message should be more specific about which Redis instances were unresponsive. Consider logging the pod IPs or names that had connectivity issues to aid in troubleshooting.

Copilot uses AI. Check for mistakes.
}

return nil
}

Expand Down Expand Up @@ -324,18 +347,25 @@ func (r *RedisFailoverChecker) GetMasterIP(rf *redisfailoverv1.RedisFailover) (s
}

masters := []string{}
connectivityErrors := 0
rport := getRedisPort(rf.Spec.Redis.Port)
for _, rip := range rips {
master, err := r.redisClient.IsMaster(rip, rport, password)
if err != nil {
r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip)
connectivityErrors++
continue
}
if master {
masters = append(masters, rip)
}
}

// If all nodes are unresponsive, return an error
if connectivityErrors == len(rips) {
return "", errors.New("all redis nodes are unresponsive")
}

if len(masters) != 1 {
return "", errors.New("number of redis nodes known as master is different than 1")
}
Expand All @@ -357,17 +387,25 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailove
return nMasters, err
}

connectivityErrors := 0
rport := getRedisPort(rf.Spec.Redis.Port)
for _, rip := range rips {
master, err := r.redisClient.IsMaster(rip, rport, password)
if err != nil {
r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip)
connectivityErrors++
continue
}
if master {
nMasters++
}
}

// Log connectivity status for debugging
if connectivityErrors > 0 {
r.logger.Warningf("Found %d masters out of %d responsive nodes (%d nodes had connectivity issues)", nMasters, len(rips)-connectivityErrors, connectivityErrors)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this log warning even mean?

}

return nMasters, nil
}

Expand Down
3 changes: 1 addition & 2 deletions service/redis/client.go
Comment thread
pratheep-kumar marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import (
"strconv"
"strings"

rediscli "github.com/go-redis/redis/v8"
"github.com/freshworks/redis-operator/log"
"github.com/freshworks/redis-operator/metrics"
rediscli "github.com/go-redis/redis/v8"
)

// Client defines the functions neccesary to connect to redis and sentinel to get or set what we nned
Expand Down Expand Up @@ -162,7 +162,6 @@ func (c *client) ResetSentinel(ip string) error {

// GetSlaveOf returns the master of the given redis, or nil if it's master
func (c *client) GetSlaveOf(ip, port, password string) (string, error) {

options := &rediscli.Options{
Addr: net.JoinHostPort(ip, port),
Password: password,
Expand Down
Loading