7 changes: 7 additions & 0 deletions api/v1/aerospikecluster_types.go
@@ -540,6 +540,13 @@ type Rack struct { //nolint:govet // for readability
// Effective/operative PodSpec. The resultant is user input if specified else global PodSpec
// +optional
PodSpec RackPodSpec `json:"effectivePodSpec,omitempty"`

// ForceBlockFromRoster, when set to true, forcefully removes all nodes in this rack from the roster.
// With this enabled, the operator ignores unavailable or dead partitions and proceeds with node removal.
// Additionally, any failed or pending pods in this rack are also ignored during this process.
// It may result in data loss. Use this with caution.
// +optional
ForceBlockFromRoster *bool `json:"forceBlockFromRoster,omitempty"`
}

// ValidationPolicySpec controls validation of the Aerospike cluster resource.
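For context, a minimal sketch of how the new field would be set from Go. The module import path and the use of k8s.io/utils/ptr are assumptions, not taken from this diff:

```go
package main

import (
	"fmt"

	// Assumed import path for the operator's API package; adjust to the module in use.
	asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/v4/api/v1"
	"k8s.io/utils/ptr"
)

func main() {
	// Hypothetical rack spec: only the fields relevant to this change are set.
	rack := asdbv1.Rack{
		ID: 2,
		// Force-remove all of rack 2's nodes from the roster, ignoring
		// unavailable/dead partitions and failed or pending pods.
		ForceBlockFromRoster: ptr.To(true),
	}
	fmt.Println(*rack.ForceBlockFromRoster)
}
```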
5 changes: 5 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml
@@ -6812,6 +6812,13 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster, when set to true, forcefully removes all nodes in this rack from the roster.
With this enabled, the operator ignores unavailable or dead partitions and proceeds with node removal.
Additionally, any failed or pending pods in this rack are also ignored during this process.
It may result in data loss. Use this with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
@@ -16079,6 +16086,13 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster, when set to true, forcefully removes all nodes in this rack from the roster.
With this enabled, the operator ignores unavailable or dead partitions and proceeds with node removal.
Additionally, any failed or pending pods in this rack are also ignored during this process.
It may result in data loss. Use this with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
2 changes: 1 addition & 1 deletion go.mod
@@ -5,7 +5,7 @@ go 1.24.10
require (
github.com/aerospike/aerospike-backup-service/v3 v3.4.1-0.20251113113955-5523ec8870b1
github.com/aerospike/aerospike-client-go/v8 v8.4.1
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251114122505-c5d5241d8594
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251125140123-6c83f948aaf8
github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d
github.com/deckarep/golang-set/v2 v2.8.0
github.com/evanphx/json-patch v4.12.0+incompatible
4 changes: 2 additions & 2 deletions go.sum
@@ -6,8 +6,8 @@ github.com/aerospike/aerospike-backup-service/v3 v3.4.1-0.20251113113955-5523ec8
github.com/aerospike/aerospike-backup-service/v3 v3.4.1-0.20251113113955-5523ec8870b1/go.mod h1:O96iFIRIgxZsp2J0LW/aT68Cxro+SkyRBmiYDgvJOOI=
github.com/aerospike/aerospike-client-go/v8 v8.4.1 h1:WhkI3JxWf/irsDh0lcLpwdwSqbRiU8DWTtb8UuZz1Ew=
github.com/aerospike/aerospike-client-go/v8 v8.4.1/go.mod h1:t1LXZ3QVi4B4lR9qEkyei1eMbuohBJBGs1YY5b5fYEI=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251114122505-c5d5241d8594 h1:Mp4N7J3Zx+kinwQm4vUu7KoJ1+Ygq2p9eNilEtW5tug=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251114122505-c5d5241d8594/go.mod h1:91WXtmHojERsKw1o/L/sehgUEYuqOCb58zi37vaIyH4=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251125140123-6c83f948aaf8 h1:afnIRbE44tZzqKM1S7lWDM3+/n8/lWixuHD3yL7fxLM=
github.com/aerospike/aerospike-management-lib v1.8.1-0.20251125140123-6c83f948aaf8/go.mod h1:91WXtmHojERsKw1o/L/sehgUEYuqOCb58zi37vaIyH4=
github.com/aerospike/backup-go v0.8.0 h1:9jR2YbG5joqDmoqqnwo/AbqrytJ2RCI0+TQYm2nMDAw=
github.com/aerospike/backup-go v0.8.0/go.mod h1:vmB+qTHv10pKIM6jtSHY7kLkD79lydkzIQP9znG1j6U=
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
@@ -6812,6 +6812,13 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster, when set to true, forcefully removes all nodes in this rack from the roster.
With this enabled, the operator ignores unavailable or dead partitions and proceeds with node removal.
Additionally, any failed or pending pods in this rack are also ignored during this process.
It may result in data loss. Use this with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
@@ -16079,6 +16086,13 @@ spec:
- name
x-kubernetes-list-type: map
type: object
forceBlockFromRoster:
description: |-
ForceBlockFromRoster, when set to true, forcefully removes all nodes in this rack from the roster.
With this enabled, the operator ignores unavailable or dead partitions and proceeds with node removal.
Additionally, any failed or pending pods in this rack are also ignored during this process.
It may result in data loss. Use this with caution.
type: boolean
id:
description: Identifier for the rack
type: integer
26 changes: 17 additions & 9 deletions internal/controller/cluster/pod.go
@@ -946,12 +946,16 @@ func (r *SingleClusterReconciler) getIgnorablePods(
racksToDelete []asdbv1.Rack, configuredRacks []RackState, revisionChangedRacks map[int]revisionChangedRack,
) (sets.Set[string], error) {
ignorablePodNames := sets.Set[string]{}
ignorableRackIDs := sets.Set[int]{}

for rackIdx := range racksToDelete {
ignorableRacks := append([]asdbv1.Rack{}, racksToDelete...)
ignorableRacks = append(ignorableRacks, getRacksToBeBlockedFromRoster(r.Log, configuredRacks)...)

for rackIdx := range ignorableRacks {
r.Log.Info("Ignorable rack found", "rackID",
ignorableRacks[rackIdx].ID, "rackRevision", ignorableRacks[rackIdx].Revision)

rackPods, err := r.getRackPodList(racksToDelete[rackIdx].ID, racksToDelete[rackIdx].Revision)
rackPods, err := r.getRackPodList(ignorableRacks[rackIdx].ID, ignorableRacks[rackIdx].Revision)
if err != nil {
return nil, err
}
@@ -962,6 +966,8 @@ func (r *SingleClusterReconciler) getIgnorablePods(
ignorablePodNames.Insert(pod.Name)
}
}

ignorableRackIDs.Insert(ignorableRacks[rackIdx].ID)
}

// Handle failed pods from old revisions of revision-changed racks
@@ -993,21 +999,22 @@ func (r *SingleClusterReconciler) getIgnorablePods(
}

for idx := range configuredRacks {
rack := &configuredRacks[idx]
rackState := &configuredRacks[idx]
if ignorableRackIDs.Has(rackState.Rack.ID) {
// Already handled above
continue
}

failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent(
r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, int(rack.Size), false,
r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, int(rackState.Size), false,
)

podList, err := r.getRackPodList(rack.Rack.ID, rack.Rack.Revision)
podList, err := r.getRackPodList(rackState.Rack.ID, rackState.Rack.Revision)
if err != nil {
return nil, err
}

var (
failedPod []string
pendingPod []string
)
var failedPod, pendingPod []string

for podIdx := range podList.Items {
pod := &podList.Items[podIdx]
@@ -1038,6 +1045,7 @@ func (r *SingleClusterReconciler) getIgnorablePods(

return ignorablePodNames, nil
}

func (r *SingleClusterReconciler) getPodsPVCList(
podNames []string, rackID int, rackRevision string,
) ([]corev1.PersistentVolumeClaim, error) {
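The getIgnorablePods change above folds two sources of ignorable racks into one pass: racks pending deletion and racks force-blocked from the roster, with their IDs recorded so the per-rack MaxIgnorablePods loop can skip them. A condensed, self-contained sketch of that pattern, using stand-in types rather than the operator's actual definitions:

```go
package main

import "fmt"

// Stand-in types; the operator's Rack and RackState carry many more fields.
type rack struct{ ID int }
type rackState struct {
	Rack    rack
	Blocked bool // stands in for asdbv1.GetBool(rack.ForceBlockFromRoster)
}

// ignorableRackIDs merges racks pending deletion with force-blocked racks,
// so every pod in either group can be ignored and the racks skipped later.
func ignorableRackIDs(toDelete []rack, configured []rackState) map[int]bool {
	ignorable := append([]rack{}, toDelete...)
	for _, rs := range configured {
		if rs.Blocked {
			ignorable = append(ignorable, rs.Rack)
		}
	}

	ids := map[int]bool{}
	for _, r := range ignorable {
		ids[r.ID] = true
	}
	return ids
}

func main() {
	ids := ignorableRackIDs(
		[]rack{{ID: 1}},
		[]rackState{{Rack: rack{ID: 2}, Blocked: true}, {Rack: rack{ID: 3}}},
	)
	fmt.Println(ids[1], ids[2], ids[3]) // true true false
}
```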
17 changes: 17 additions & 0 deletions internal/controller/cluster/rack.go
@@ -226,6 +226,23 @@ func (r *SingleClusterReconciler) createEmptyRack(rackState *RackState) (
return found, common.ReconcileSuccess()
}

// getRacksToBeBlockedFromRoster identifies racks whose nodes should be blocked from the roster
// and returns the racks that have ForceBlockFromRoster set to true.
func getRacksToBeBlockedFromRoster(log logger, rackStateList []RackState) []asdbv1.Rack {
var racksToBlock []asdbv1.Rack

for _, rackState := range rackStateList {
// Check if this rack has ForceBlockFromRoster set to true
if asdbv1.GetBool(rackState.Rack.ForceBlockFromRoster) {
racksToBlock = append(racksToBlock, *rackState.Rack)
log.Info("Rack marked for roster blocking",
"rackID", rackState.Rack.ID)
}
}

return racksToBlock
}

func (r *SingleClusterReconciler) getRacksToDelete(rackStateList []RackState) (
[]asdbv1.Rack, error,
) {
13 changes: 12 additions & 1 deletion internal/controller/cluster/strong_consistency.go
@@ -1,6 +1,8 @@
package cluster

import (
"strconv"

gosets "github.com/deckarep/golang-set/v2"
"k8s.io/apimachinery/pkg/util/sets"

@@ -13,6 +15,14 @@ func (r *SingleClusterReconciler) getAndSetRoster(
policy *as.ClientPolicy, rosterNodeBlockList []string,
ignorablePodNames sets.Set[string],
) error {
rackStateList := getConfiguredRackStateList(r.aeroCluster)
blockedRackIDs := gosets.NewSet[string]()

blockedRacks := getRacksToBeBlockedFromRoster(r.Log, rackStateList)
for idx := range blockedRacks {
blockedRackIDs.Add(strconv.Itoa(blockedRacks[idx].ID))
}

allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames)
if err != nil {
return err
@@ -23,7 +33,8 @@ func (r *SingleClusterReconciler) getAndSetRoster(
return err
}

return deployment.GetAndSetRoster(r.Log, allHostConns, policy, rosterNodeBlockList, ignorableNamespaces)
return deployment.ManageRoster(r.Log, allHostConns, policy, rosterNodeBlockList,
ignorableNamespaces, blockedRackIDs)
}

func (r *SingleClusterReconciler) validateSCClusterState(policy *as.ClientPolicy, ignorablePodNames sets.Set[string],
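getAndSetRoster now collects the force-blocked rack IDs as strings and hands them to the management library's ManageRoster. A plausible reading of why the IDs become strings: in rack-aware strong-consistency clusters a roster entry carries its rack ID as a `nodeID@rackID` suffix, so blocking by rack amounts to a string match on that suffix. The sketch below illustrates that idea under that assumption; the actual filtering lives in aerospike-management-lib and may differ:

```go
package main

import (
	"fmt"
	"strings"
)

// filterBlockedRacks drops roster entries whose rack ID is blocked.
// Assumption: rack-aware roster entries look like "<nodeID>@<rackID>";
// this is an illustration, not the management-lib implementation.
func filterBlockedRacks(rosterNodes []string, blockedRackIDs map[string]bool) []string {
	kept := make([]string, 0, len(rosterNodes))
	for _, node := range rosterNodes {
		parts := strings.SplitN(node, "@", 2)
		if len(parts) == 2 && blockedRackIDs[parts[1]] {
			continue // rack is force-blocked; leave its node off the roster
		}
		kept = append(kept, node)
	}
	return kept
}

func main() {
	roster := []string{"A1@1", "A2@1", "B1@2"}
	fmt.Println(filterBlockedRacks(roster, map[string]bool{"2": true})) // [A1@1 A2@1]
}
```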
83 changes: 83 additions & 0 deletions internal/webhook/v1/aerospikecluster_validating_webhook.go
@@ -517,6 +517,8 @@ func validateRackUpdate(
return nil
}

forceBlockFromRosterChanged := false

// Need to exclude a default rack with default rack ID. No need to check here,
// user should not provide or update default rackID
// Also when user add new rackIDs old default will be removed by reconciler.
@@ -595,11 +597,50 @@
}
}

if oldRack.ForceBlockFromRoster != newRack.ForceBlockFromRoster {
forceBlockFromRosterChanged = true
}

break
}
}
}

return validateForceBlockFromRosterUpdate(forceBlockFromRosterChanged, newObj)
}

func validateForceBlockFromRosterUpdate(forceBlockFromRosterChanged bool, newObj *asdbv1.AerospikeCluster) error {
if forceBlockFromRosterChanged && newObj.Status.AerospikeConfig == nil {
return fmt.Errorf("status is not updated yet, cannot change forceBlockFromRoster in rack")
}

racksBlockedFromRosterInSpec := sets.New[int]()
racksBlockedFromRosterInStatus := sets.New[int]()

for idx := range newObj.Spec.RackConfig.Racks {
rack := newObj.Spec.RackConfig.Racks[idx]
if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRosterInSpec.Insert(rack.ID)
}
}

for idx := range newObj.Status.RackConfig.Racks {
rack := newObj.Status.RackConfig.Racks[idx]
if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRosterInStatus.Insert(rack.ID)
}
}

remainingRacks := len(newObj.Status.RackConfig.Racks) - len(racksBlockedFromRosterInSpec)
if err := validateRackCountConstraints(remainingRacks, &newObj.Spec.RackConfig); err != nil {
return err
}

desiredRacksBlockedFromRoster := racksBlockedFromRosterInSpec.Difference(racksBlockedFromRosterInStatus)
if len(desiredRacksBlockedFromRoster) > 1 {
return fmt.Errorf("the forceBlockFromRoster flag can be applied to only one rack at a time")
}

return nil
}

@@ -713,6 +754,8 @@ func validateRackConfig(_ logr.Logger, cluster *asdbv1.AerospikeCluster) error {
rackMap := map[int]bool{}
migrateFillDelaySet := sets.Set[int]{}

var racksBlockedFromRoster int

for idx := range cluster.Spec.RackConfig.Racks {
rack := &cluster.Spec.RackConfig.Racks[idx]
// Check for duplicate
@@ -759,6 +802,14 @@ func validateRackConfig(_ logr.Logger, cluster *asdbv1.AerospikeCluster) error {
}

migrateFillDelaySet.Insert(migrateFillDelay)

if asdbv1.GetBool(rack.ForceBlockFromRoster) {
racksBlockedFromRoster++
}
}

if err := validateRackBlockedFromRoster(racksBlockedFromRoster, cluster); err != nil {
return err
}

// If len of migrateFillDelaySet is more than 1, it means that different migrate-fill-delay is set across racks
@@ -787,6 +838,38 @@ func validateRackConfig(_ logr.Logger, cluster *asdbv1.AerospikeCluster) error {
return nil
}

func validateRackBlockedFromRoster(racksBlockedFromRoster int, cluster *asdbv1.AerospikeCluster) error {
if racksBlockedFromRoster > 0 {
if cluster.Spec.RackConfig.MaxIgnorablePods != nil {
return fmt.Errorf("forceBlockFromRoster cannot be used together with maxIgnorablePods")
}

if len(cluster.Spec.RosterNodeBlockList) > 0 {
return fmt.Errorf("forceBlockFromRoster cannot be used together with RosterNodeBlockList")
}

remainingRacks := len(cluster.Spec.RackConfig.Racks) - racksBlockedFromRoster
if err := validateRackCountConstraints(remainingRacks, &cluster.Spec.RackConfig); err != nil {
return err
}
}

return nil
}

func validateRackCountConstraints(remainingRacks int, rackConfig *asdbv1.RackConfig) error {
if remainingRacks <= 0 {
return fmt.Errorf("all racks cannot have forceBlockFromRoster enabled. At least one rack must remain in the roster")
}

if remainingRacks == 1 &&
(rackConfig.RollingUpdateBatchSize != nil || rackConfig.ScaleDownBatchSize != nil) {
return fmt.Errorf("with only one rack in roster, cannot use rollingUpdateBatchSize or scaleDownBatchSize")
}

return nil
}

type nsConf struct {
noOfRacksForNamespaces int
replicationFactor int
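Taken together, the webhook rules above mean: forceBlockFromRoster is mutually exclusive with maxIgnorablePods and rosterNodeBlockList, at most one additional rack may be blocked per update, at least one rack must stay in the roster, and the batch-size options require more than one remaining rack. A standalone restatement of the count constraint, with outcomes spelled out (hypothetical helper, not the webhook's code):

```go
package main

import "fmt"

// remainingRackOK restates validateRackCountConstraints: remaining is the
// number of racks still in the roster after blocking; batchSizesSet stands
// for rollingUpdateBatchSize or scaleDownBatchSize being configured.
func remainingRackOK(remaining int, batchSizesSet bool) error {
	if remaining <= 0 {
		return fmt.Errorf("at least one rack must remain in the roster")
	}
	if remaining == 1 && batchSizesSet {
		return fmt.Errorf("batch sizes require more than one rack in the roster")
	}
	return nil
}

func main() {
	fmt.Println(remainingRackOK(2, true))  // <nil>
	fmt.Println(remainingRackOK(1, true))  // error: lone rack with batch sizes
	fmt.Println(remainingRackOK(0, false)) // error: roster would be empty
}
```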