17 changes: 9 additions & 8 deletions operator/controllers/patroni_core_controller.go
@@ -196,14 +196,6 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
pr.logger.Info("Reconcile will be started...")
time.Sleep(30 * time.Second)

if err := credentials.ProcessCreds(pr.helper.GetOwnerReferences()); err != nil {
return pr.handleReconcileError(maxReconcileAttempts,
"CanNotActualizeCredsOnCluster",
newCrHash,
"Error during actualization of creds on cluster",
err)
}

if len(cr.RunTestsTime) > 0 {
pr.logger.Info("runTestsOnly : true")
if err := pr.createTestsPods(cr); err != nil {
@@ -274,6 +266,15 @@ func (pr *PatroniCoreReconciler) Reconcile(ctx context.Context, request ctrl.Req
return reconcile.Result{RequeueAfter: time.Minute}, err
}

// Process credentials after cluster is created
if err := credentials.ProcessCreds(pr.helper.GetOwnerReferences()); err != nil {
return pr.handleReconcileError(maxReconcileAttempts,
"CanNotActualizeCredsOnCluster",
newCrHash,
"Error during actualization of creds on cluster",
err)
}

if err := pr.helper.UpdatePatroniConfigMaps(); err != nil {
pr.logger.Error("error during update of patroni config maps", zap.Error(err))
// will not return err because there is a slight chance, that
134 changes: 134 additions & 0 deletions tests/robot/Lib/PlatformLibrary.py
@@ -0,0 +1,134 @@
# Minimal PlatformLibrary stub for local testing
# This wraps kubernetes-client to provide the interface expected by pgsLibrary.py

from kubernetes import client, config
from kubernetes.stream import stream
import logging

log = logging.getLogger(__name__)

class PlatformLibrary:
def __init__(self, managed_by_operator=None):
try:
# Try to load in-cluster config first
config.load_incluster_config()
except Exception:
# Fall back to kubeconfig
try:
config.load_kube_config()
except Exception as e:
log.warning(f"Could not load kubernetes config: {e}")

self.core_api = client.CoreV1Api()
self.apps_api = client.AppsV1Api()
self.managed_by_operator = managed_by_operator

def get_pods(self, namespace, **kwargs):
"""Get pods in a namespace"""
label_selector = kwargs.get('label_selector', '')
# Note: managed_by_operator is stored but not automatically applied as a filter
# The real PlatformLibrary likely handles this differently, or it's used elsewhere

pods = self.core_api.list_namespaced_pod(namespace, label_selector=label_selector if label_selector else None)
return pods.items

def execute_command_in_pod(self, pod_name, namespace, command):
"""Execute a command in a pod"""
try:
if isinstance(command, str):
command = ['/bin/sh', '-c', command]

resp = stream(self.core_api.connect_get_namespaced_pod_exec,
pod_name, namespace,
command=command,
stderr=True, stdin=False,
stdout=True, tty=False)
return resp, None
except Exception as e:
return None, str(e)

def get_config_map(self, name, namespace):
"""Get a ConfigMap"""
return self.core_api.read_namespaced_config_map(name, namespace)

def get_secret(self, name, namespace):
"""Get a Secret"""
return self.core_api.read_namespaced_secret(name, namespace)

def get_deployment_entity(self, name, namespace):
"""Get a Deployment"""
return self.apps_api.read_namespaced_deployment(name, namespace)

def get_deployment_entities(self, namespace):
"""Get all Deployments in a namespace"""
deployments = self.apps_api.list_namespaced_deployment(namespace)
return deployments.items

def get_replica_number(self, name, namespace):
"""Get replica count for a deployment"""
deployment = self.apps_api.read_namespaced_deployment(name, namespace)
return deployment.spec.replicas

def set_replicas_for_deployment_entity(self, name, namespace, replicas):
"""Set replica count for a deployment"""
body = {'spec': {'replicas': replicas}}
self.apps_api.patch_namespaced_deployment_scale(name, namespace, body)

def delete_pod_by_pod_name(self, pod_name, namespace, grace_period=0):
"""Delete a pod"""
self.core_api.delete_namespaced_pod(pod_name, namespace,
grace_period_seconds=grace_period)

def get_replica_set(self, name, namespace):
"""Get a ReplicaSet"""
return self.apps_api.read_namespaced_replica_set(name, namespace)

def get_stateful_set(self, name, namespace):
"""Get a StatefulSet"""
return self.apps_api.read_namespaced_stateful_set(name, namespace)

def scale_down_stateful_set(self, name, namespace):
"""Scale down a StatefulSet to 0"""
self.set_replicas_for_stateful_set(name, namespace, 0)

def set_replicas_for_stateful_set(self, name, namespace, replicas):
"""Set replica count for a StatefulSet"""
body = {'spec': {'replicas': replicas}}
self.apps_api.patch_namespaced_stateful_set_scale(name, namespace, body)

def check_service_of_stateful_sets_is_scaled(self, stateful_set_names, namespace,
direction='down', timeout=60):
"""Check if StatefulSets are scaled in a direction"""
# Simplified implementation
import time
start = time.time()
while time.time() - start < timeout:
all_scaled = True
for name in stateful_set_names:
ss = self.get_stateful_set(name, namespace)
if direction == 'down' and ss.spec.replicas > 0:
all_scaled = False
elif direction == 'up' and ss.spec.replicas == 0:
all_scaled = False
if all_scaled:
return True
time.sleep(2)
return False

def get_resource_image(self, resource_type, name, namespace, container_name=None):
"""Get container image for a resource"""
if resource_type.lower() == 'deployment':
resource = self.get_deployment_entity(name, namespace)
elif resource_type.lower() == 'statefulset':
resource = self.get_stateful_set(name, namespace)
else:
return None

containers = resource.spec.template.spec.containers
if container_name:
for container in containers:
if container.name == container_name:
return container.image
elif len(containers) > 0:
return containers[0].image
return None
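
For orientation, here is a minimal usage sketch of the stub above (not part of the diff). It assumes a reachable cluster; the `postgres` namespace matches the `POD_NAMESPACE` used by the tests in this PR, while the label selector is only an example.

```python
# Illustrative only: exercising the PlatformLibrary stub against a live cluster.
from PlatformLibrary import PlatformLibrary  # assumes tests/robot/Lib is on PYTHONPATH

pl = PlatformLibrary()

# List pods in the namespace the Robot tests use (POD_NAMESPACE=postgres).
pods = pl.get_pods("postgres", label_selector="app=postgres-backup-daemon")
for pod in pods:
    print(pod.metadata.name, pod.status.phase)

# Run a command in the first matching pod; the stub returns (stdout, None) on success
# or (None, error_string) on failure.
if pods:
    out, err = pl.execute_command_in_pod(pods[0].metadata.name, "postgres", "hostname")
    print(out if err is None else err)
```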
2 changes: 1 addition & 1 deletion tests/robot/Lib/lib.robot
@@ -103,7 +103,7 @@ Insert Test Record
${res}= Execute Query ${MASTERHOST} select * from test_insert_robot where id=${RID} dbname=${database}
Should Be True """${EXPECTED}""" in """${res}""" msg=[insert test record] Expected string ${EXPECTED} not found on ${MASTERHOST} : res: ${res}
Log To Console Test records found on ${MASTERHOST}
[Return] ${RID} ${EXPECTED}
RETURN ${RID} ${EXPECTED}

Check Test Record
[Arguments] ${pod_name} ${RID} ${EXPECTED} ${database}=postgres
22 changes: 11 additions & 11 deletions tests/robot/Lib/pgsLibrary.py
@@ -61,7 +61,7 @@ def setup_console_logging(self):
def setup_robot_logging(self):
try:
from robot.api import logger
except ImportError as e:
except ImportError:
pass
log = logging.getLogger()
log.setLevel(logging.INFO)
@@ -80,7 +80,7 @@ def emit(self, record):
logger.info(msg)
except (KeyboardInterrupt, SystemExit):
raise
except:
except Exception:
self.handleError(record)
log.addHandler(RobotRedirectHandler())

@@ -178,7 +178,7 @@ def execute_auth_check(self):
config_map_name = "patroni-{}.config.yaml".format(cluster_name)
try:
config_map = self.pl_lib.get_config_map(config_map_name, self._namespace)
except:
except Exception:
config_map_name = "{}-patroni.config.yaml".format(cluster_name)
config_map = self.pl_lib.get_config_map(config_map_name, self._namespace)
config_map_yaml = (config_map.to_dict())
@@ -207,8 +207,12 @@ def get_pods(self, **kwargs):
if (key == 'status'):
pods = list([x for x in pods if x.status.phase == value])
if (key == 'label'):
(k, v) = value.split(":")
pods = list([x for x in pods if k in x.metadata.labels and x.metadata.labels[k] == v])
# Support both ":" and "=" as separators
if ":" in value:
(k, v) = value.split(":", 1)
else:
(k, v) = value.split("=", 1)
pods = list([x for x in pods if x.metadata.labels and k in x.metadata.labels and x.metadata.labels[k] == v])
return pods

def get_pod(self, **kwargs):
@@ -344,10 +348,6 @@ def http_request(self, url):
logging.info("Error {0}. url: {1}".format(e, url))
return resp

def get_master_service(self):
master_service = "pg-" + os.getenv("PG_CLUSTER_NAME", "patroni")
return master_service

def make_switchover_via_patroni_rest(self):
logging.info("Manual switchover via Patroni REST is called")
master = self.get_master_pod_id()
@@ -375,7 +375,7 @@ def make_switchover_via_patroni_rest(self):
assert new_master == replica

def check_if_next_run_scheduled(self):
pod = self.get_pod(label='app:postgres-backup-daemon', status='Running')
self.get_pod(label='app:postgres-backup-daemon', status='Running')
schedule = requests.get(f"{self._scheme}://postgres-backup-daemon:8085/schedule", verify=False)
schedule_json = schedule['stdout']
if "time_until_next_backup" in schedule_json:
@@ -573,7 +573,7 @@ def schedule_backup(self):
health_json = requests.get(f"{self._scheme}://postgres-backup-daemon:8080/health", verify=False).json()
new_dump_count = int(health_json["storage"]["lastSuccessful"]["ts"])
delta = int(expr_date) - new_dump_count
except:
except Exception:
logging.exception("Cannot parse delta")
delta = 60000
if delta < 60000:
106 changes: 106 additions & 0 deletions tests/robot/check_installation/README_BOOTSTRAP_TEST.md
@@ -0,0 +1,106 @@
# Bootstrap Regression Test

## Purpose

This test validates the fix for: **"operator crashes during bootstrap because credentials.ProcessCreds() was called before reconcilePatroniCoreCluster()"**

## What It Tests

The `check_operator_bootstrap.robot` test ensures:

1. ✅ Operator starts successfully
2. ✅ Patroni cluster is created without operator crashes
3. ✅ Credentials are processed **after** cluster exists (not before)
4. ✅ PostgreSQL StatefulSets are created
5. ✅ PostgreSQL pods come up successfully
6. ✅ No nil pointer dereference or panic errors in operator logs (see the log-scan sketch after this list)
7. ✅ No "context deadline exceeded" errors during bootstrap
8. ✅ Replication works
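
Items 6 and 7 boil down to scanning the operator pod's logs for known failure signatures. A minimal sketch of that check using kubernetes-client is shown below; the operator label selector is an assumption, and the actual Robot test may locate the pod differently.

```python
from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()

# Failure signatures that must not appear during bootstrap.
BAD_PATTERNS = [
    "panic",
    "nil pointer dereference",
    "context deadline exceeded",
    "Error during actualization of creds on cluster",
]

# The label selector below is an assumption; adjust it to how the operator pod is labeled.
pods = core.list_namespaced_pod("postgres", label_selector="name=postgres-operator").items
for pod in pods:
    logs = core.read_namespaced_pod_log(pod.metadata.name, "postgres")
    hits = [p for p in BAD_PATTERNS if p in logs]
    assert not hits, f"Operator logs contain failure signatures: {hits}"
```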

## How to Run

### Option 1: Run via Docker (Recommended)

```bash
# From repository root
cd tests

# Build test image
docker build -t pgskipper-operator-tests:local .

# Run the bootstrap test
docker run --rm \
-e POD_NAMESPACE=postgres \
-e PG_CLUSTER_NAME=patroni \
-e PG_NODE_QTY=2 \
-e KUBECONFIG=/config/kubeconfig \
-v ~/.kube/config:/config/kubeconfig \
pgskipper-operator-tests:local \
robot -i check_operator_bootstrap /test_runs/check_installation/
```

### Option 2: Run with Robot Framework directly

```bash
# Install Robot Framework
pip install robotframework robotframework-requests kubernetes

# Set environment variables
export POD_NAMESPACE=postgres
export PG_CLUSTER_NAME=patroni
export PG_NODE_QTY=2

# Run test
cd tests/robot
robot -i check_operator_bootstrap check_installation/check_operator_bootstrap.robot
```

## Expected Results

### ✅ Success

```
==============================================================================
Check Installation :: Check operator doesn't crash during cluster bootstrap
==============================================================================
Check Operator Bootstrap Without Crash | PASS |
------------------------------------------------------------------------------
Check Installation :: Check operator doesn't crash during clust... | PASS |
1 test, 1 passed, 0 failed
```

**Operator Logs**: No errors related to:
- `context deadline exceeded`
- `nil pointer dereference`
- `Error during actualization of creds on cluster`
- `panic`

### ❌ Failure (Old Bug)

If the fix is reverted, you would see:

```
Check Operator Bootstrap Without Crash | FAIL |
Operator logs contain: "Error during actualization of creds on cluster"
```

**Operator Logs** would contain:
```
ERROR: Error during actualization of creds on cluster
panic: runtime error: invalid memory address or nil pointer dereference
```

## Related Files

- **Fix**: `operator/controllers/patroni_core_controller.go:270`
- **Original Bug**: ProcessCreds was at line 202 (before cluster creation)
- **Current Fix**: ProcessCreds moved to line 270 (after cluster creation)

## Maintenance

If the code structure changes:

1. Update line numbers in test documentation
2. Verify error messages still match
3. Update log assertions if error format changes
4. Keep test tags up to date