-
Notifications
You must be signed in to change notification settings - Fork 12
Update role label irrespective of connection failure from operator #38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -155,8 +155,12 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis | |
| } | ||
|
|
||
| rport := getRedisPort(rf.Spec.Redis.Port) | ||
| var connectivityErrors []error | ||
|
|
||
| // This ensures labels are always updated regardless of Redis connectivity issues | ||
| for _, rp := range rps.Items { | ||
| if rp.Status.PodIP == master { | ||
| r.logger.Infof("Update pod label, namespace: %s, pod name: %s, labels: %v", rf.Namespace, rp.Name, generateRedisMasterRoleLabel()) | ||
| err = r.setMasterLabelIfNecessary(rf.Namespace, rp) | ||
| if err != nil { | ||
| return err | ||
|
|
@@ -166,6 +170,7 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis | |
| return err | ||
| } | ||
| } else { | ||
| r.logger.Infof("Update pod label, namespace: %s, pod name: %s, labels: %v", rf.Namespace, rp.Name, generateRedisSlaveRoleLabel()) | ||
|
||
| err = r.setSlaveLabelIfNecessary(rf.Namespace, rp) | ||
| if err != nil { | ||
| return err | ||
|
|
@@ -175,16 +180,34 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis | |
| return err | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // If any Redis instance is unresponsive, we log the error but don't fail the entire operation | ||
| for _, rp := range rps.Items { | ||
| slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password) | ||
| if err != nil { | ||
| r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP) | ||
| return err | ||
| connectivityErrors = append(connectivityErrors, err) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this be exposed as metrics, rf_redis_connectivity_error |
||
| continue // Continue checking other pods instead of failing immediately | ||
| } | ||
| if slave != "" && slave != master { | ||
| return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave) | ||
| } | ||
| } | ||
|
|
||
| // If ALL pods had connectivity errors, return an error | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did not get this! Whats happening here? |
||
| // This preserves the original behavior when all pods are unresponsive | ||
| if len(connectivityErrors) == len(rps.Items) && len(rps.Items) > 0 { | ||
| r.logger.Errorf("All Redis instances were unresponsive during replication check") | ||
| return connectivityErrors[0] // Return the first error | ||
| } | ||
|
|
||
| // If we had some connectivity errors but labels were updated successfully, | ||
| // log a warning but allow the healing process to continue | ||
| if len(connectivityErrors) > 0 { | ||
| r.logger.Warningf("Some Redis instances were unresponsive during replication check, but pod labels were updated successfully. Connectivity errors: %d", len(connectivityErrors)) | ||
|
||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
|
|
@@ -324,18 +347,25 @@ func (r *RedisFailoverChecker) GetMasterIP(rf *redisfailoverv1.RedisFailover) (s | |
| } | ||
|
|
||
| masters := []string{} | ||
| connectivityErrors := 0 | ||
| rport := getRedisPort(rf.Spec.Redis.Port) | ||
| for _, rip := range rips { | ||
| master, err := r.redisClient.IsMaster(rip, rport, password) | ||
| if err != nil { | ||
| r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip) | ||
| connectivityErrors++ | ||
| continue | ||
| } | ||
| if master { | ||
| masters = append(masters, rip) | ||
| } | ||
| } | ||
|
|
||
| // If all nodes are unresponsive, return an error | ||
| if connectivityErrors == len(rips) { | ||
| return "", errors.New("all redis nodes are unresponsive") | ||
| } | ||
|
|
||
| if len(masters) != 1 { | ||
| return "", errors.New("number of redis nodes known as master is different than 1") | ||
| } | ||
|
|
@@ -357,17 +387,25 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailove | |
| return nMasters, err | ||
| } | ||
|
|
||
| connectivityErrors := 0 | ||
| rport := getRedisPort(rf.Spec.Redis.Port) | ||
| for _, rip := range rips { | ||
| master, err := r.redisClient.IsMaster(rip, rport, password) | ||
| if err != nil { | ||
| r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip) | ||
| connectivityErrors++ | ||
| continue | ||
| } | ||
| if master { | ||
| nMasters++ | ||
| } | ||
| } | ||
|
|
||
| // Log connectivity status for debugging | ||
| if connectivityErrors > 0 { | ||
| r.logger.Warningf("Found %d masters out of %d responsive nodes (%d nodes had connectivity issues)", nMasters, len(rips)-connectivityErrors, connectivityErrors) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this log warning even mean? |
||
| } | ||
|
|
||
| return nMasters, nil | ||
| } | ||
|
|
||
|
|
||
|
pratheep-kumar marked this conversation as resolved.
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These debug log messages could generate excessive noise in production. Consider using Debug level instead of Info for label update operations, or add a condition to only log when labels actually change.