aerospike
diff --git a/‎.github/workflows/README.md‎
Lines changed: 52 additions & 0 deletions b/‎.github/workflows/README.md‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎.github/workflows/pkg-unit-tests.yaml‎
Lines changed: 36 additions & 0 deletions b/‎.github/workflows/pkg-unit-tests.yaml‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 12 additions & 0 deletions b/‎Makefile‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎TESTING.md‎
Lines changed: 123 additions & 0 deletions b/‎TESTING.md‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎api/v1/utils.go‎
Lines changed: 3 additions & 0 deletions b/‎api/v1/utils.go‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎config/manager/manager.yaml‎
Lines changed: 3 additions & 0 deletions b/‎config/manager/manager.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎helm-charts/aerospike-kubernetes-operator/templates/aerospike-operator-controller-manager-deployment.yaml‎
Lines changed: 2 additions & 0 deletions b/‎helm-charts/aerospike-kubernetes-operator/templates/aerospike-operator-controller-manager-deployment.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎helm-charts/aerospike-kubernetes-operator/values.yaml‎
Lines changed: 3 additions & 0 deletions b/‎helm-charts/aerospike-kubernetes-operator/values.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎internal/controller/cluster/pod.go‎
Lines changed: 35 additions & 10 deletions b/‎internal/controller/cluster/pod.go‎
Lines changed: 35 additions & 10 deletions
@@ -0,0 +1,52 @@
+# GitHub Actions Workflows
+
+This directory contains GitHub Actions workflows for CI/CD automation.
+
+## Available Workflows
+
+### 1. Unit tests (`pkg-unit-tests.yaml`)
+**Unit testing workflow for pkg directory**
+
+**Triggers:**
+- Push to `master` branch
+- Pull requests to `master` branch
+- **Only when files in the `pkg/` directory or the workflow file itself (`.github/workflows/pkg-unit-tests.yaml`) change**
+
+**Features:**
+- ✅ Runs all unit tests in `pkg/` directory with race detection
+- ✅ Basic coverage reporting
+- ✅ Fast execution for quick feedback
+- ✅ Minimal configuration
+
+### 2. GolangCI Lint (`golangci-lint.yaml`)
+**Code quality and linting checks**
+
+**Triggers:**
+- Push to `master` branch or version tags
+- Pull requests to `master` branch
+
+**Features:**
+- ✅ Runs golangci-lint with comprehensive checks
+- ✅ 5-minute timeout for large codebases
+
+### 3. CodeQL Analysis (`codeql-analysis.yml`)
+**Security and code quality analysis**
+
+**Triggers:**
+- Scheduled runs and code changes
+
+**Features:**
+- ✅ Security vulnerability scanning
+- ✅ Code quality analysis
+
+### 4. Docker Image Release (`docker-image-release.yaml`)
+**Container image building and publishing**
+
+**Triggers:**
+- Version tag pushes
+
+**Features:**
+- ✅ Multi-architecture Docker image builds
+- ✅ Image publishing to container registry
+
+
@@ -0,0 +1,36 @@
+name: unit tests
+
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - 'pkg/**'
+      - '.github/workflows/pkg-unit-tests.yaml'
+  pull_request:
+    branches:
+      - master
+    paths:
+      - 'pkg/**'
+      - '.github/workflows/pkg-unit-tests.yaml'
+
+jobs:
+  pkg-unit-tests:
+    name: pkg
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+      
+      - name: Run pkg unit tests
+        run: make pkg-test
+
@@ -141,6 +141,18 @@ go-lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
 .PHONY: all-test
 all-test: manifests generate fmt vet setup-envtest cluster-test backup-service-test backup-test restore-test ## Run tests.
 
+# pkg-test runs the unit tests under ./pkg/... with the race detector enabled,
+# writes the coverage profile to coverage.out, and prints the total coverage line.
+# printf is used for the summary header because echo's handling of "\n" is
+# shell-dependent (some shells print the backslash sequence literally).
+.PHONY: pkg-test
+pkg-test: ## Run unit tests for pkg directory
+	@echo "Running pkg unit tests..."
+	go test -v -race -coverprofile=coverage.out ./pkg/...
+	@printf '\nCoverage Summary:\n'
+	@go tool cover -func=coverage.out | tail -1
+
+.PHONY: pkg-test-coverage
+pkg-test-coverage: pkg-test ## Run pkg unit tests and open coverage report in browser
+	@echo "Opening coverage report in browser..."
+	go tool cover -html=coverage.out
+
 .PHONY: cluster-test
 cluster-test: manifests generate fmt vet setup-envtest ## Run tests.
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" cd $(shell pwd)/test/cluster; mkdir -p ../test-results; go run github.com/onsi/ginkgo/v2/ginkgo --grace-period=10m -p --procs=8 -coverprofile ascover.out -v -show-node-events --focus="$(FOCUS)" -timeout=5h0m0s --junit-report=../test-results/junit-cluster.xml  -- ${ARGS}
 
@@ -0,0 +1,123 @@
+# Testing Guide
+
+This document describes how to run tests for the Aerospike Kubernetes Operator.
+
+## Quick Reference
+
+| Command                           | Description                              |
+|-----------------------------------|------------------------------------------|
+| `make pkg-test`                   | Run pkg unit tests (CI command)          |
+| `make pkg-test-coverage`          | Run tests + open coverage report         |
+| `make cluster-test`               | Run cluster integration tests            |
+| `make backup-service-test`        | Run backup-service integration tests     |
+| `make backup-test`                | Run backup integration tests             |
+| `make restore-test`               | Run restore integration tests            |
+| `make all-test`                   | Run all integration tests                |
+| `go test ./pkg/...`               | Run all pkg tests manually               |
+| `go test -v -race ./pkg/utils`    | Run specific package with race detection |
+| `go test -run TestName ./pkg/...` | Run specific test                        |
+
+## Quick Start
+
+### Run PKG Unit Tests (Recommended)
+
+```bash
+# Run pkg unit tests (same command used in CI)
+make pkg-test
+
+# Run pkg unit tests and open coverage report in browser
+make pkg-test-coverage
+```
+
+## Available Test Targets
+
+### Unit Tests
+
+#### `make pkg-test`
+Runs all unit tests in the `pkg/` directory with race detection and coverage reporting.
+
+**What it does:**
+- Runs `go test -v -race -coverprofile=coverage.out ./pkg/...`
+- Displays coverage summary at the end
+- Same command used by GitHub Actions CI
+
+**Example output:**
+```
+Running pkg unit tests...
+=== RUN   TestGetFailedPodGracePeriod
+--- PASS: TestGetFailedPodGracePeriod (0.00s)
+...
+PASS
+coverage: 29.8% of statements
+
+Coverage Summary:
+total: (statements) 18.6%
+```
+
+#### `make pkg-test-coverage`
+Runs pkg unit tests and opens an HTML coverage report in your browser.
+
+**Use this when:**
+- You want to see detailed line-by-line coverage
+- You're improving test coverage
+- You need to identify untested code paths
+
+### Integration Tests
+
+#### `make cluster-test`
+Runs cluster integration tests using Ginkgo.
+
+#### `make backup-service-test`
+Runs backup-service integration tests.
+
+#### `make backup-test`
+Runs backup integration tests.
+
+#### `make restore-test`
+Runs restore integration tests.
+
+#### `make all-test`
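+Runs all integration test suites (cluster, backup-service, backup, and restore). It does not run the pkg unit tests; use `make pkg-test` for those.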
+
+## Running Tests Manually
+
+### Run all pkg tests:
+```bash
+go test -v -race ./pkg/...
+```
+
+### Run specific package:
+```bash
+go test -v -race ./pkg/utils
+go test -v -race ./pkg/jsonpatch
+go test -v -race ./pkg/merge
+```
+
+### Run specific test:
+```bash
+go test -v -race ./pkg/utils -run TestGetFailedPodGracePeriod
+```
+
+### Run with coverage:
+```bash
+go test -v -race -coverprofile=coverage.out ./pkg/...
+go tool cover -func=coverage.out
+go tool cover -html=coverage.out
+```
+
+## Test Coverage
+
+### Current Coverage
+
+Run `make pkg-test` to see current coverage:
+```
+pkg/jsonpatch: 64.1% of statements
+pkg/merge:     89.3% of statements
+pkg/utils:     29.8% of statements
+total:         18.6% of statements
+```
+
+## Additional Resources
+- [Go Testing Documentation](https://pkg.go.dev/testing)
+- [GitHub Actions Workflows](.github/workflows/README.md)
+
@@ -35,6 +35,9 @@ const (
 	AdminPortName    = "admin"
 
 	InfoPortName = "info"
+
+	DefaultFailedPodGracePeriodSeconds = 60
+	RequeueIntervalSeconds10           = 10
 )
 
 const (
 
@@ -60,6 +60,9 @@ spec:
         - name: WATCH_NAMESPACE
           # for watching multiple namespaces by operator, give a list of namespaces (e.g. aerospike,test,test1,test2)
           value: aerospike
+        - name: FAILED_POD_GRACE_PERIOD_SECONDS
+          # grace period (in seconds) before failed pods are deleted/recovered, default is 60 seconds
+          # the value is quoted because Kubernetes env var values must be strings
+          value: "60"
         - name: AEROSPIKE_KUBERNETES_INIT_REGISTRY
           # this is the registry used to pull aerospike-init image
           value: docker.io
 
@@ -61,6 +61,8 @@ spec:
         env:
         - name: WATCH_NAMESPACE
           value: {{ .Values.watchNamespaces | quote }}
+        - name: FAILED_POD_GRACE_PERIOD_SECONDS
+          value: {{ .Values.failedPodGracePeriodSeconds | quote }}
         - name: AEROSPIKE_KUBERNETES_INIT_REGISTRY
           value: {{ .Values.aerospikeKubernetesInitRegistry }}
         - name: AEROSPIKE_KUBERNETES_INIT_REGISTRY_NAMESPACE
 
@@ -34,6 +34,9 @@ certs:
 ##  Operator configurations
 watchNamespaces: "default,aerospike"
 
+# Grace period to delete/recover failed pods (in seconds)
+failedPodGracePeriodSeconds: "60"
+
 # Registry used to pull aerospike-init image
 aerospikeKubernetesInitRegistry: "docker.io"
 
 
@@ -261,7 +261,7 @@ func (r *SingleClusterReconciler) rollingRestartPods(
 	rackState *RackState, podsToRestart []*corev1.Pod, ignorablePodNames sets.Set[string],
 	restartTypeMap map[string]RestartType,
 ) common.ReconcileResult {
-	failedPods, activePods := getFailedAndActivePods(podsToRestart)
+	failedPods, failedWithinGracePeriodPods, activePods := getFailedAndActivePods(podsToRestart, true)
 
 	// If already dead node (failed pod) then no need to check node safety, migration
 	if len(failedPods) != 0 {
@@ -317,6 +317,15 @@ func (r *SingleClusterReconciler) rollingRestartPods(
 		}
 	}
 
+	if len(failedWithinGracePeriodPods) != 0 {
+		r.Log.Info(
+			"Pods are in failed state but within grace period, will not delete",
+			"pods", getPodNames(failedWithinGracePeriodPods),
+		)
+
+		return common.ReconcileRequeueAfter(asdbv1.RequeueIntervalSeconds10)
+	}
+
 	return common.ReconcileSuccess()
 }
 
@@ -548,22 +557,28 @@ func (r *SingleClusterReconciler) ensurePodsRunningAndReady(podsToCheck []*corev
 		podNames,
 	)
 
-	return common.ReconcileRequeueAfter(10)
+	return common.ReconcileRequeueAfter(asdbv1.RequeueIntervalSeconds10)
 }
 
-func getFailedAndActivePods(pods []*corev1.Pod) (failedPods, activePods []*corev1.Pod) {
+func getFailedAndActivePods(
+	pods []*corev1.Pod, withGracePeriod bool) (failedPods, failedWithinGracePeriodPods, activePods []*corev1.Pod,
+) {
 	for idx := range pods {
 		pod := pods[idx]
 
-		if err := utils.CheckPodFailed(pod); err != nil {
+		podState := utils.CheckPodFailedWithGrace(pod, withGracePeriod)
+
+		switch podState.State {
+		case utils.PodHealthy:
+			activePods = append(activePods, pod)
+		case utils.PodFailedInGrace:
+			failedWithinGracePeriodPods = append(failedWithinGracePeriodPods, pod)
+		case utils.PodFailed:
 			failedPods = append(failedPods, pod)
-			continue
 		}
-
-		activePods = append(activePods, pod)
 	}
 
-	return failedPods, activePods
+	return failedPods, failedWithinGracePeriodPods, activePods
 }
 
 func getNonIgnorablePods(pods []*corev1.Pod, ignorablePodNames sets.Set[string],
@@ -585,7 +600,7 @@ func getNonIgnorablePods(pods []*corev1.Pod, ignorablePodNames sets.Set[string],
 func (r *SingleClusterReconciler) safelyDeletePodsAndEnsureImageUpdated(
 	rackState *RackState, podsToUpdate []*corev1.Pod, ignorablePodNames sets.Set[string],
 ) common.ReconcileResult {
-	failedPods, activePods := getFailedAndActivePods(podsToUpdate)
+	failedPods, failedWithinGracePeriodPods, activePods := getFailedAndActivePods(podsToUpdate, true)
 
 	// If already dead node (failed pod) then no need to check node safety, migration
 	if len(failedPods) != 0 {
@@ -640,6 +655,15 @@ func (r *SingleClusterReconciler) safelyDeletePodsAndEnsureImageUpdated(
 		}
 	}
 
+	if len(failedWithinGracePeriodPods) != 0 {
+		r.Log.Info(
+			"Pods are in failed state but within grace period, will not delete",
+			"pods", getPodNames(failedWithinGracePeriodPods),
+		)
+
+		return common.ReconcileRequeueAfter(asdbv1.RequeueIntervalSeconds10)
+	}
+
 	return common.ReconcileSuccess()
 }
 
@@ -720,6 +744,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P
 				return common.ReconcileError(err)
 			}
 
+			// This is an operation on an existing cluster, so check for pod failure without a grace period to report it immediately.
 			if err := utils.CheckPodFailed(updatedPod); err != nil {
 				return common.ReconcileError(err)
 			}
@@ -746,7 +771,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P
 		podNames,
 	)
 
-	return common.ReconcileRequeueAfter(10)
+	return common.ReconcileRequeueAfter(asdbv1.RequeueIntervalSeconds10)
 }
 
 // cleanupPods checks pods and status before scale-up to detect and fix any
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,9 @@ const (`
`35`	`35`	`AdminPortName = "admin"`
`36`	`36`
`37`	`37`	`InfoPortName = "info"`
	`38`	`+`
	`39`	`+ DefaultFailedPodGracePeriodSeconds = 60`
	`40`	`+ RequeueIntervalSeconds10 = 10`
`38`	`41`	`)`
`39`	`42`
`40`	`43`	`const (`