Skip to content
Open
6 changes: 6 additions & 0 deletions api/v1/aerospikecluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,12 @@ type Rack struct { //nolint:govet // for readability
// Effective/operative PodSpec. The resultant is user input if specified else global PodSpec
// +optional
PodSpec RackPodSpec `json:"effectivePodSpec,omitempty"`

// ForceBlockFromRoster, when set to true, removes all nodes of this rack from the roster.
// The operator will ignore unavailable/dead partitions and remove the nodes from the roster anyway.
// Hence, this should be used with caution.
// +optional
ForceBlockFromRoster *bool `json:"forceBlockFromRoster,omitempty"`
}

// ValidationPolicySpec controls validation of the Aerospike cluster resource.
Expand Down
5 changes: 5 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6816,6 +6816,12 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster when set to true, removes all nodes from this rack from the roster
If set to true, the operator will ignore unavailable/dead partitions and go ahead and remove the nodes from roster
Hence, this should be used with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
Expand Down Expand Up @@ -16088,6 +16094,12 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster when set to true, removes all nodes from this rack from the roster
If set to true, the operator will ignore unavailable/dead partitions and go ahead and remove the nodes from roster
Hence, this should be used with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.23.11
require (
github.com/aerospike/aerospike-backup-service/v3 v3.2.0
github.com/aerospike/aerospike-client-go/v8 v8.2.2
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250829065008-0363517825bc
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250912062927-f5201cbc4c08
github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d
github.com/deckarep/golang-set/v2 v2.8.0
github.com/evanphx/json-patch v4.12.0+incompatible
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ github.com/aerospike/aerospike-backup-service/v3 v3.2.0 h1:vqvT7OE1didb/R4vOTNWI
github.com/aerospike/aerospike-backup-service/v3 v3.2.0/go.mod h1:aSD1pSRjyMYQ1UW2Kkg/vinn+ngeobKzDIVj+UZIi2c=
github.com/aerospike/aerospike-client-go/v8 v8.2.2 h1:NV1GxB+ATUb1cQtwaIS731A/6EkwuAX4/heh8CpvQOI=
github.com/aerospike/aerospike-client-go/v8 v8.2.2/go.mod h1:H6CzKDoHxBj1yY/oQPci1bUIbEx2ATQtJ2GtZ+N64Wg=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250829065008-0363517825bc h1:/vlU9v4xNL/e0Lpa+OXixHLqZQedvsiK385H7rMWOlk=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250829065008-0363517825bc/go.mod h1:RNKlkSnppiaBaLEdAiCeBv68M+odPR5JR+ziO0NfaHQ=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250912062927-f5201cbc4c08 h1:AoY6nlBRt+FYnFo6RDu7BDGwOQ52HTI8+f5aW/ZHvKw=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20250912062927-f5201cbc4c08/go.mod h1:RNKlkSnppiaBaLEdAiCeBv68M+odPR5JR+ziO0NfaHQ=
github.com/aerospike/backup-go v0.5.1 h1:8ryLV0nOoo3Zn7LRV1kQABFnYgg1V0r9ZJJOL22FBs0=
github.com/aerospike/backup-go v0.5.1/go.mod h1:+RJdOaVCG6jQlhoNIJrzQgZnJ/HEBPfAiPGdv3YYzyg=
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6816,6 +6816,12 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster when set to true, removes all nodes from this rack from the roster
If set to true, the operator will ignore unavailable/dead partitions and go ahead and remove the nodes from roster
Hence, this should be used with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
Expand Down Expand Up @@ -16088,6 +16094,12 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster when set to true, removes all nodes from this rack from the roster
If set to true, the operator will ignore unavailable/dead partitions and go ahead and remove the nodes from roster
Hence, this should be used with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
Expand Down
50 changes: 50 additions & 0 deletions internal/controller/cluster/aero_info_calls.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ package cluster
import (
"context"
"fmt"
"strconv"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -82,6 +84,54 @@ func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady(
return common.ReconcileSuccess()
}

// getNodeIDsFromBlockedRacks returns node IDs from all racks that have ForceBlockFromRoster: true.
// Node IDs are synthesized as <rackID><infix><podOrdinal>, where the infix is derived from the
// node IDs already recorded in pod status for that rack.
func (r *SingleClusterReconciler) getNodeIDsFromBlockedRacks() []string {
	// Without any pod status there is nothing on the roster to block.
	if len(r.aeroCluster.Status.Pods) == 0 {
		return nil
	}

	var nodeIDs []string

	for _, rackState := range getConfiguredRackStateList(r.aeroCluster) {
		if !asdbv1.GetBool(rackState.Rack.ForceBlockFromRoster) {
			continue
		}

		rackID := rackState.Rack.ID
		infix := getNodeIDInfixForRack(r.aeroCluster.Status.Pods, rackID)

		// One node ID per pod ordinal in the rack's configured size.
		for ordinal := int32(0); ordinal < rackState.Size; ordinal++ {
			id := fmt.Sprintf("%d%s%d", rackID, infix, ordinal)
			nodeIDs = append(nodeIDs, id)

			r.Log.V(1).Info("Added rack node to comprehensive block list",
				"rackID", rackID, "nodeID", id)
		}
	}

	return nodeIDs
}

// getNodeIDInfixForRack derives the single character that sits between the rack ID
// prefix and the pod ordinal in an existing node ID of the given rack, falling back
// to "A" when no matching pod is found.
// NOTE(review): assumes node IDs are formatted as <rackID><infix><ordinal> and that
// all pods of a rack share one infix — map iteration order is random, so the first
// matching pod wins; confirm against how node IDs are assigned.
func getNodeIDInfixForRack(pods map[string]asdbv1.AerospikePodStatus, rackID int) string {
	prefix := strconv.Itoa(rackID)

	for name := range pods {
		id := pods[name].Aerospike.NodeID
		// Skip IDs too short to hold the rack prefix plus one infix character,
		// and IDs that do not belong to this rack.
		if len(id) <= len(prefix) || !strings.HasPrefix(id, prefix) {
			continue
		}

		return string(id[len(prefix)]) // character immediately after the rack ID
	}

	return "A"
}

func (r *SingleClusterReconciler) quiescePods(
policy *as.ClientPolicy, allHostConns []*deployment.HostConn, pods []*corev1.Pod, ignorablePodNames sets.Set[string],
) error {
Expand Down
26 changes: 16 additions & 10 deletions internal/controller/cluster/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -905,13 +905,17 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe
// 1. From racksToDelete that are currently not running and can be ignored in stability checks.
// 2. Failed/pending pods from the configuredRacks identified using maxIgnorablePods field and
// can be ignored from stability checks.
func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configuredRacks []RackState) (
func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []int, configuredRacks []RackState) (
sets.Set[string], error,
) {
ignorablePodNames := sets.Set[string]{}
ignorableRacks := sets.Set[int]{}

for rackIdx := range racksToDelete {
rackPods, err := r.getRackPodList(racksToDelete[rackIdx].ID)
ignorableRacks.Insert(racksToDelete...)
ignorableRacks.Insert(getRacksToBeBlockedFromRoster(r.Log, configuredRacks)...)

for rackID := range ignorableRacks {
rackPods, err := r.getRackPodList(rackID)
if err != nil {
return nil, err
}
Expand All @@ -925,21 +929,22 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack,
}

for idx := range configuredRacks {
rack := &configuredRacks[idx]
rackState := &configuredRacks[idx]
if ignorableRacks.Has(rackState.Rack.ID) {
// Already handled above
continue
}

failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent(
r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, int(rack.Size), false,
r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, int(rackState.Size), false,
)

podList, err := r.getRackPodList(rack.Rack.ID)
podList, err := r.getRackPodList(rackState.Rack.ID)
if err != nil {
return nil, err
}

var (
failedPod []string
pendingPod []string
)
var failedPod, pendingPod []string

for podIdx := range podList.Items {
pod := &podList.Items[podIdx]
Expand Down Expand Up @@ -970,6 +975,7 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack,

return ignorablePodNames, nil
}

func (r *SingleClusterReconciler) getPodsPVCList(
podNames []string, rackID int,
) ([]corev1.PersistentVolumeClaim, error) {
Expand Down
19 changes: 18 additions & 1 deletion internal/controller/cluster/rack.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (r *SingleClusterReconciler) reconcileRacks() common.ReconcileResult {
rackIDsToDelete = append(rackIDsToDelete, racksToDelete[idx].ID)
}

ignorablePodNames, err := r.getIgnorablePods(racksToDelete, rackStateList)
ignorablePodNames, err := r.getIgnorablePods(rackIDsToDelete, rackStateList)
if err != nil {
return common.ReconcileError(err)
}
Expand Down Expand Up @@ -276,6 +276,23 @@ func (r *SingleClusterReconciler) createEmptyRack(rackState *RackState) (
return found, common.ReconcileSuccess()
}

// getRacksToBeBlockedFromRoster identifies racks whose nodes should be blocked from
// the roster and returns the IDs of all racks that have ForceBlockFromRoster: true.
func getRacksToBeBlockedFromRoster(log logger, rackStateList []RackState) []int {
	var blocked []int

	for idx := range rackStateList {
		rack := rackStateList[idx].Rack
		if !asdbv1.GetBool(rack.ForceBlockFromRoster) {
			continue
		}

		blocked = append(blocked, rack.ID)
		log.Info("Rack marked for roster blocking",
			"rackID", rack.ID)
	}

	return blocked
}

func (r *SingleClusterReconciler) getRacksToDelete(rackStateList []RackState) (
[]asdbv1.Rack, error,
) {
Expand Down
9 changes: 8 additions & 1 deletion internal/controller/cluster/strong_consistency.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ func (r *SingleClusterReconciler) getAndSetRoster(
policy *as.ClientPolicy, rosterNodeBlockList []string,
ignorablePodNames sets.Set[string],
) error {
// Get node IDs from racks with ForceBlockFromRoster: true
rackBlockedNodeIDs := r.getNodeIDsFromBlockedRacks()

// Append rack-blocked node IDs to the user-specified block list
rosterNodeBlockList = append(rosterNodeBlockList, rackBlockedNodeIDs...)

allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames)
if err != nil {
return err
Expand All @@ -23,7 +29,8 @@ func (r *SingleClusterReconciler) getAndSetRoster(
return err
}

return deployment.GetAndSetRoster(r.Log, allHostConns, policy, rosterNodeBlockList, ignorableNamespaces)
return deployment.ManageRoster(r.Log, allHostConns, policy, rosterNodeBlockList,
ignorableNamespaces, len(rackBlockedNodeIDs) > 0)
}

func (r *SingleClusterReconciler) validateSCClusterState(policy *as.ClientPolicy, ignorablePodNames sets.Set[string],
Expand Down
58 changes: 58 additions & 0 deletions internal/webhook/v1/aerospikecluster_validating_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ func validateRackUpdate(
return nil
}

forceBlockFromRosterChanged := false

// Old racks cannot be updated
// Also need to exclude a default rack with default rack ID. No need to check here,
// user should not provide or update default rackID
Expand Down Expand Up @@ -510,11 +512,47 @@ func validateRackUpdate(
}
}

if oldRack.ForceBlockFromRoster != newRack.ForceBlockFromRoster {
forceBlockFromRosterChanged = true
}

break
}
}
}

if forceBlockFromRosterChanged && newObj.Status.AerospikeConfig == nil {
return fmt.Errorf("status is not updated yet, cannot change ForceBlockFromRoster in rack")
}

racksBlockedFromRosterInSpec := make(sets.Set[int])
racksBlockedFromRosterInStatus := make(sets.Set[int])

for idx := range newObj.Spec.RackConfig.Racks {
rack := newObj.Spec.RackConfig.Racks[idx]
if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRosterInSpec.Insert(rack.ID)
}
}

for idx := range newObj.Status.RackConfig.Racks {
rack := newObj.Status.RackConfig.Racks[idx]
if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRosterInStatus.Insert(rack.ID)
}
}

if len(newObj.Status.RackConfig.Racks)-len(racksBlockedFromRosterInSpec) == 1 &&
(newObj.Spec.RackConfig.RollingUpdateBatchSize != nil || newObj.Spec.RackConfig.ScaleDownBatchSize != nil) {
return fmt.Errorf("with only one rack remaining in roster, cannot use batch update or scale down")
}

desiredRacksBlockedFromRoster := racksBlockedFromRosterInSpec.Difference(racksBlockedFromRosterInStatus)

if len(desiredRacksBlockedFromRoster) > 1 {
return fmt.Errorf("can change only one rack at a time to ForceBlockFromRoster: true")
}

return nil
}

Expand Down Expand Up @@ -594,6 +632,8 @@ func validateRackConfig(_ logr.Logger, cluster *asdbv1.AerospikeCluster) error {
rackMap := map[int]bool{}
migrateFillDelaySet := sets.Set[int]{}

var racksBlockedFromRoster int

for idx := range cluster.Spec.RackConfig.Racks {
rack := &cluster.Spec.RackConfig.Racks[idx]
// Check for duplicate
Expand Down Expand Up @@ -640,6 +680,24 @@ func validateRackConfig(_ logr.Logger, cluster *asdbv1.AerospikeCluster) error {
}

migrateFillDelaySet.Insert(migrateFillDelay)

if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRoster++
}
}

if racksBlockedFromRoster > 0 {
if cluster.Spec.RackConfig.MaxIgnorablePods != nil {
return fmt.Errorf("ForceBlockFromRoster: true racks cannot be used with MaxIgnorablePods set")
}

if len(cluster.Spec.RosterNodeBlockList) > 0 {
return fmt.Errorf("ForceBlockFromRoster: true racks cannot be used with RosterNodeBlockList")
}

if racksBlockedFromRoster == len(cluster.Spec.RackConfig.Racks) {
return fmt.Errorf("all racks cannot have ForceBlockFromRoster: true. At least one rack must remain in the roster")
}
}

// If len of migrateFillDelaySet is more than 1, it means that different migrate-fill-delay is set across racks
Expand Down
Loading