From 25575b93f48c93b2878703b21f41b730b397488a Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 26 Mar 2025 15:35:58 -0300 Subject: [PATCH 01/11] Add check for snap revision mismatch Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 30ca8117a9..0e2b0b74b8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -114,6 +114,7 @@ PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit" EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs" EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs" +SNAP_REVISIONS_MISMATCH_MESSAGE = "Workloads revisions are not the expected ones" Scopes = Literal[APP_SCOPE, UNIT_SCOPE] PASSWORD_USERS = [*SYSTEM_USERS, "patroni"] @@ -1522,6 +1523,10 @@ def _on_update_status(self, _) -> None: if not self._can_run_on_update_status(): return + if not self._ensure_right_workloads_revisions(): + self.unit.status = BlockedStatus(SNAP_REVISIONS_MISMATCH_MESSAGE) + return + if ( self.is_cluster_restoring_backup or self.is_cluster_restoring_to_time ) and not self._was_restore_successful(): @@ -1547,6 +1552,38 @@ def _on_update_status(self, _) -> None: # Restart topology observer if it is gone self._observer.start_observer() + def _ensure_right_workloads_revisions(self) -> bool: + """Ensure the workloads revisions are the expected ones.""" + right_workloads_revisions = True + for snap_name, snap_version in SNAP_PACKAGES: + try: + snap_cache = snap.SnapCache() + snap_package = snap_cache[snap_name] + if not snap_package.present: + right_workloads_revisions = False + else: + if revision := snap_version.get("revision"): + try: + revision = revision[platform.machine()] + except IndexError: + logger.error("Unavailable snap architecture %s", platform.machine()) + right_workloads_revisions = False + continue + if snap_package.revision != revision: + logger.error("Snap revision mismatch for %s: ", snap_name) + right_workloads_revisions = False + continue + channel = snap_version.get("channel", "") + if channel != "" and snap_package.channel != channel: + logger.error("Snap channel is not the expected one for %s", snap_name) + right_workloads_revisions = False + except (snap.SnapError, snap.SnapNotFoundError) as e: + logger.error( + "An exception occurred when checking %s snap revision. Reason: %s", snap_name, str(e) + ) + right_workloads_revisions = False + return right_workloads_revisions + def _was_restore_successful(self) -> bool: if self.is_cluster_restoring_to_time and all(self.is_pitr_failed()): logger.error( @@ -1616,7 +1653,7 @@ def _can_run_on_update_status(self) -> bool: logger.debug("Early exit on_update_status: upgrade in progress") return False - if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES: + if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES and self.unit.status != SNAP_REVISIONS_MISMATCH_MESSAGE: # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: self.enable_disable_extensions() From a2162f139b839b9264cb067c46680fb584995754 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 26 Mar 2025 15:53:54 -0300 Subject: [PATCH 02/11] Fix blocked status resolution Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 0e2b0b74b8..7abd56fff9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1653,7 +1653,7 @@ def _can_run_on_update_status(self) -> bool: logger.debug("Early exit on_update_status: upgrade in progress") return False - if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES and self.unit.status != SNAP_REVISIONS_MISMATCH_MESSAGE: + if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE: # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: self.enable_disable_extensions() From 2f476354bbe15e30436c72708a01fd22257b0024 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 27 Mar 2025 09:09:10 -0300 Subject: [PATCH 03/11] Reinitialise stuck replica Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/charm.py b/src/charm.py index 7abd56fff9..490ef9556a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1532,8 +1532,8 @@ def _on_update_status(self, _) -> None: ) and not self._was_restore_successful(): return - if self._handle_processes_failures(): - return + # if self._handle_processes_failures(): + # return self.postgresql_client_relation.oversee_users() if self.primary_endpoint: @@ -1653,7 +1653,7 @@ def _can_run_on_update_status(self) -> bool: logger.debug("Early exit on_update_status: upgrade in progress") return False - if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE: + if self.is_blocked and self.unit.status.message not in S3_BLOCK_MESSAGES and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE: # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: self.enable_disable_extensions() @@ -1713,6 +1713,16 @@ def _handle_workload_failures(self) -> bool: self._patroni.restart_patroni() return True + if ( + not self.has_raft_keys() + and self._unit_ip in self.members_ips + and self._patroni.member_inactive + ): + logger.warning("Inactive member detected. Reinitialising unit.") + self.unit.status = MaintenanceStatus("reinitialising replica") + self._patroni.reinitialize_postgresql() + return True + return False def _set_primary_status_message(self) -> None: From 51eb8a53628f520ed654eec7430f238d3dbd7907 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 27 Mar 2025 17:41:18 -0300 Subject: [PATCH 04/11] Handle differentiation between frozen process and replication failure Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/charm.py b/src/charm.py index 490ef9556a..bcfe1a3d0b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1532,8 +1532,8 @@ def _on_update_status(self, _) -> None: ) and not self._was_restore_successful(): return - # if self._handle_processes_failures(): - # return + if self._handle_processes_failures(): + return self.postgresql_client_relation.oversee_users() if self.primary_endpoint: @@ -1671,13 +1671,23 @@ def _handle_processes_failures(self) -> bool: # Restart the PostgreSQL process if it was frozen (in that case, the Patroni # process is running by the PostgreSQL process not). if self._unit_ip in self.members_ips and self._patroni.member_inactive: - try: - self._patroni.restart_patroni() - logger.info("restarted PostgreSQL because it was not running") - return True - except RetryError: - logger.error("failed to restart PostgreSQL after checking that it was not running") - return False + if self._member_name in self._patroni.cluster_members: + try: + logger.warning("Inactive member detected. Reinitialising unit.") + self.unit.status = MaintenanceStatus("reinitialising replica") + self._patroni.reinitialize_postgresql() + return True + except RetryError: + logger.error("failed to restart PostgreSQL after checking that it was not running") + return False + else: + try: + self._patroni.restart_patroni() + logger.info("restarted PostgreSQL because it was not running") + return True + except RetryError: + logger.error("failed to restart PostgreSQL after checking that it was not running") + return False return False @@ -1713,16 +1723,6 @@ def _handle_workload_failures(self) -> bool: self._patroni.restart_patroni() return True - if ( - not self.has_raft_keys() - and self._unit_ip in self.members_ips - and self._patroni.member_inactive - ): - logger.warning("Inactive member detected. Reinitialising unit.") - self.unit.status = MaintenanceStatus("reinitialising replica") - self._patroni.reinitialize_postgresql() - return True - return False def _set_primary_status_message(self) -> None: From 4c58f5ad90b78e61ef245ea5b7e9b990417e9cc7 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Thu, 27 Mar 2025 17:47:38 -0300 Subject: [PATCH 05/11] Fix error message and formatting Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/charm.py b/src/charm.py index bcfe1a3d0b..c8edd306f8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1579,7 +1579,9 @@ def _ensure_right_workloads_revisions(self) -> bool: right_workloads_revisions = False except (snap.SnapError, snap.SnapNotFoundError) as e: logger.error( - "An exception occurred when checking %s snap revision. Reason: %s", snap_name, str(e) + "An exception occurred when checking %s snap revision. Reason: %s", + snap_name, + str(e), ) right_workloads_revisions = False return right_workloads_revisions @@ -1653,7 +1655,11 @@ def _can_run_on_update_status(self) -> bool: logger.debug("Early exit on_update_status: upgrade in progress") return False - if self.is_blocked and self.unit.status.message not in S3_BLOCK_MESSAGES and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE: + if ( + self.is_blocked + and self.unit.status.message not in S3_BLOCK_MESSAGES + and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE + ): # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: self.enable_disable_extensions() @@ -1678,7 +1684,9 @@ def _handle_processes_failures(self) -> bool: self._patroni.reinitialize_postgresql() return True except RetryError: - logger.error("failed to restart PostgreSQL after checking that it was not running") + logger.error( + "failed to reinitialise PostgreSQL after checking that replica bootstrap failed" + ) return False else: try: @@ -1686,7 +1694,9 @@ def _handle_processes_failures(self) -> bool: logger.info("restarted PostgreSQL because it was not running") return True except RetryError: - logger.error("failed to restart PostgreSQL after checking that it was not running") + logger.error( + "failed to restart PostgreSQL after checking that it was not running" + ) return False return False From 4ac0aaf5417e42e700caca23e6122ba86d19be88 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 10:02:32 -0300 Subject: [PATCH 06/11] Improve messages with revision numbers and add resolution message Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/charm.py b/src/charm.py index 9c62d96029..8ab0fd8a17 100755 --- a/src/charm.py +++ b/src/charm.py @@ -114,7 +114,7 @@ PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit" EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs" EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs" -SNAP_REVISIONS_MISMATCH_MESSAGE = "Workloads revisions are not the expected ones" +SNAP_REVISIONS_MISMATCH_MESSAGE = "Snap(s) revision(s) mismatch" Scopes = Literal[APP_SCOPE, UNIT_SCOPE] PASSWORD_USERS = [*SYSTEM_USERS, "patroni"] @@ -1549,6 +1549,22 @@ def _on_update_status(self, _) -> None: def _ensure_right_workloads_revisions(self) -> bool: """Ensure the workloads revisions are the expected ones.""" right_workloads_revisions = True + resolution_message = f""" +Check the latest unit shown in the following command, which will be the primary unit: + +sudo -H -u snap_daemon charmed-postgresql.patronictl -c /var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml history {self.app.name} + +Then, run the following commands to fix the issue (first in the primary, then in each replica): + +# List which services are active. +sudo snap services charmed-postgresql + +# Refresh the snap to the right revision. +sudo snap refresh charmed-postgresql --revision EXPECTED-REVISION-NUMBER + +# Start the patroni and the other previously active services again. +sudo snap start charmed-postgresql.patroni + """ for snap_name, snap_version in SNAP_PACKAGES: try: snap_cache = snap.SnapCache() @@ -1564,12 +1580,14 @@ def _ensure_right_workloads_revisions(self) -> bool: right_workloads_revisions = False continue if snap_package.revision != revision: - logger.error("Snap revision mismatch for %s: ", snap_name) + logger.error("Snap revision mismatch for %s: expected %s but found %s", snap_name, revision, snap_package.revision) + logger.error(resolution_message) right_workloads_revisions = False continue channel = snap_version.get("channel", "") if channel != "" and snap_package.channel != channel: - logger.error("Snap channel is not the expected one for %s", snap_name) + logger.error("Snap channel mismatch for %s: expected % but found %s", snap_name, channel, snap_package.channel) + logger.error(resolution_message) right_workloads_revisions = False except (snap.SnapError, snap.SnapNotFoundError) as e: logger.error( From 7f55540ca6d0c2800ccdf92120abf5a82f857890 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 10:06:52 -0300 Subject: [PATCH 07/11] Fix formatting Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index 8ab0fd8a17..e88ce987bf 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1580,13 +1580,23 @@ def _ensure_right_workloads_revisions(self) -> bool: right_workloads_revisions = False continue if snap_package.revision != revision: - logger.error("Snap revision mismatch for %s: expected %s but found %s", snap_name, revision, snap_package.revision) + logger.error( + "Snap revision mismatch for %s: expected %s but found %s", + snap_name, + revision, + snap_package.revision, + ) logger.error(resolution_message) right_workloads_revisions = False continue channel = snap_version.get("channel", "") if channel != "" and snap_package.channel != channel: - logger.error("Snap channel mismatch for %s: expected % but found %s", snap_name, channel, snap_package.channel) + logger.error( + "Snap channel mismatch for %s: expected % but found %s", + snap_name, + channel, + snap_package.channel, + ) logger.error(resolution_message) right_workloads_revisions = False except (snap.SnapError, snap.SnapNotFoundError) as e: From 2291ae17d079f3caaa7634c834882828b9bcec53 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 12:20:50 -0300 Subject: [PATCH 08/11] Improve blocked status message and add snap revision mismatch integration test Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 50 +++++------- .../test_snap_revision_mismatch.py | 76 +++++++++++++++++++ .../test_snap_revision_mismatch/task.yaml | 7 ++ 3 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 tests/integration/test_snap_revision_mismatch.py create mode 100644 tests/spread/test_snap_revision_mismatch/task.yaml diff --git a/src/charm.py b/src/charm.py index e88ce987bf..1bae9e9737 100755 --- a/src/charm.py +++ b/src/charm.py @@ -114,7 +114,7 @@ PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit" EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs" EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs" -SNAP_REVISIONS_MISMATCH_MESSAGE = "Snap(s) revision(s) mismatch" +SNAP_REVISION_MISMATCH_MESSAGE = "Snap revision mismatch" Scopes = Literal[APP_SCOPE, UNIT_SCOPE] PASSWORD_USERS = [*SYSTEM_USERS, "patroni"] @@ -1518,7 +1518,7 @@ def _on_update_status(self, _) -> None: return if not self._ensure_right_workloads_revisions(): - self.unit.status = BlockedStatus(SNAP_REVISIONS_MISMATCH_MESSAGE) + self.unit.status = BlockedStatus(SNAP_REVISION_MISMATCH_MESSAGE) return if ( @@ -1549,22 +1549,6 @@ def _on_update_status(self, _) -> None: def _ensure_right_workloads_revisions(self) -> bool: """Ensure the workloads revisions are the expected ones.""" right_workloads_revisions = True - resolution_message = f""" -Check the latest unit shown in the following command, which will be the primary unit: - -sudo -H -u snap_daemon charmed-postgresql.patronictl -c /var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml history {self.app.name} - -Then, run the following commands to fix the issue (first in the primary, then in each replica): - -# List which services are active. -sudo snap services charmed-postgresql - -# Refresh the snap to the right revision. -sudo snap refresh charmed-postgresql --revision EXPECTED-REVISION-NUMBER - -# Start the patroni and the other previously active services again. -sudo snap start charmed-postgresql.patroni - """ for snap_name, snap_version in SNAP_PACKAGES: try: snap_cache = snap.SnapCache() @@ -1586,18 +1570,22 @@ def _ensure_right_workloads_revisions(self) -> bool: revision, snap_package.revision, ) - logger.error(resolution_message) - right_workloads_revisions = False - continue - channel = snap_version.get("channel", "") - if channel != "" and snap_package.channel != channel: - logger.error( - "Snap channel mismatch for %s: expected % but found %s", - snap_name, - channel, - snap_package.channel, - ) - logger.error(resolution_message) + logger.error(f""" +Check the latest unit shown in the following command, which will be the primary unit: + +sudo -H -u snap_daemon charmed-postgresql.patronictl -c /var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml history {self.app.name} + +Then, run the following commands to fix the issue (first in the primary, then in each replica): + +# List which services are active. +sudo snap services charmed-postgresql + +# Refresh the snap to the right revision. +sudo snap refresh charmed-postgresql --revision EXPECTED-REVISION-NUMBER + +# Start the patroni and the other previously active services again. +sudo snap start charmed-postgresql.patroni + """) right_workloads_revisions = False except (snap.SnapError, snap.SnapNotFoundError) as e: logger.error( @@ -1680,7 +1668,7 @@ def _can_run_on_update_status(self) -> bool: if ( self.is_blocked and self.unit.status.message not in S3_BLOCK_MESSAGES - and self.unit.status.message != SNAP_REVISIONS_MISMATCH_MESSAGE + and self.unit.status.message != SNAP_REVISION_MISMATCH_MESSAGE ): # If charm was failing to disable plugin, try again (user may have removed the objects) if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: diff --git a/tests/integration/test_snap_revision_mismatch.py b/tests/integration/test_snap_revision_mismatch.py new file mode 100644 index 0000000000..e68fa3712d --- /dev/null +++ b/tests/integration/test_snap_revision_mismatch.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. +import logging + +import pytest as pytest +from pytest_operator.plugin import OpsTest + +from constants import POSTGRESQL_SNAP_NAME, SNAP_PACKAGES + +from . import architecture +from .helpers import ( + DATABASE_APP_NAME, + run_command_on_unit, +) + +logger = logging.getLogger(__name__) + + +@pytest.mark.abort_on_fail +async def test_snap_revision_mismatch(ops_test: OpsTest, charm) -> None: + """Test snap revision mismatch.""" + (await ops_test.model.deploy(charm, config={"profile": "testing"}),) + async with ops_test.fast_forward(): + logger.info("Waiting for the database to become active") + await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") + + unit_name = f"{DATABASE_APP_NAME}/0" + old_snap_revision = "142" if architecture.architecture == "arm64" else "143" + expected_snap_revision = next( + iter([ + snap[1]["revision"][ + architecture.architecture.replace("arm64", "aarch64").replace( + "amd64", "x86_64" + ) + ] + for snap in SNAP_PACKAGES + if snap[0] == POSTGRESQL_SNAP_NAME + ]) + ) + + logger.info( + f"Downgrading {unit_name} snap revision from {expected_snap_revision} to {old_snap_revision}" + ) + await run_command_on_unit( + ops_test, + unit_name, + f"sudo snap refresh charmed-postgresql --revision {old_snap_revision}", + ) + await run_command_on_unit( + ops_test, unit_name, "sudo snap start charmed-postgresql.patroni" + ) + + logger.info("Waiting for the database to become blocked") + application = ops_test.model.applications[DATABASE_APP_NAME] + await ops_test.model.block_until( + lambda: "blocked" in {unit.workload_status for unit in application.units} + ) + assert application.units[0].workload_status_message == "Snap revision mismatch" + + logger.info( + f"Upgrading the snap revision back to the expected one ({expected_snap_revision})" + ) + await run_command_on_unit( + ops_test, + unit_name, + f"sudo snap refresh charmed-postgresql --revision {expected_snap_revision}", + ) + await run_command_on_unit( + ops_test, unit_name, "sudo snap start charmed-postgresql.patroni" + ) + + logger.info("Waiting for the database to become active again") + await ops_test.model.block_until( + lambda: "active" in {unit.workload_status for unit in application.units} + ) diff --git a/tests/spread/test_snap_revision_mismatch/task.yaml b/tests/spread/test_snap_revision_mismatch/task.yaml new file mode 100644 index 0000000000..10c0db7aac --- /dev/null +++ b/tests/spread/test_snap_revision_mismatch/task.yaml @@ -0,0 +1,7 @@ +summary: test_snap_revision_mismatch.py +environment: + TEST_MODULE: test_snap_revision_mismatch.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From a5e1081cf462f11372754b45fb6db27b08e9c2e3 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 12:47:27 -0300 Subject: [PATCH 09/11] Add failed replica bootstrap integration test Signed-off-by: Marcelo Henrique Neppel --- .../test_failed_replica_bootstrap.py | 53 +++++++++++++++++++ .../test_snap_revision_mismatch.py | 3 +- .../test_failed_replica_bootstrap/task.yaml | 7 +++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 tests/integration/test_failed_replica_bootstrap.py create mode 100644 tests/spread/test_failed_replica_bootstrap/task.yaml diff --git a/tests/integration/test_failed_replica_bootstrap.py b/tests/integration/test_failed_replica_bootstrap.py new file mode 100644 index 0000000000..5f3485334b --- /dev/null +++ b/tests/integration/test_failed_replica_bootstrap.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. +import logging + +import pytest as pytest +from pytest_operator.plugin import OpsTest + +from .helpers import ( + DATABASE_APP_NAME, + get_primary, + run_command_on_unit, +) + +logger = logging.getLogger(__name__) + + +async def test_failed_replica_bootstrap(ops_test: OpsTest, charm) -> None: + """Test failed replica bootstrap.""" + await ops_test.model.deploy(charm, config={"profile": "testing"}, num_units=3) + async with ops_test.fast_forward(): + logger.info("Waiting for the database to become active") + await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") + + any_unit = ops_test.model.applications[DATABASE_APP_NAME].units[0].name + primary = await get_primary(ops_test, any_unit) + replica = next( + unit.name + for unit in ops_test.model.applications[DATABASE_APP_NAME].units + if unit.name != primary + ) + + logger.info(f"Removing the pg_control file from {replica} to make it fail") + await run_command_on_unit( + ops_test, + replica, + "sudo rm /var/snap/charmed-postgresql/common/var/lib/postgresql/global/pg_control", + ) + await run_command_on_unit( + ops_test, replica, "sudo snap restart charmed-postgresql.patroni" + ) + + logger.info("Waiting for the database to become in maintenance") + application = ops_test.model.applications[DATABASE_APP_NAME] + await ops_test.model.block_until( + lambda: "maintenance" + in {unit.workload_status for unit in application.units if unit.name == replica} + ) + + logger.info("Waiting for the database to become active again") + await ops_test.model.block_until( + lambda: "active" in {unit.workload_status for unit in application.units} + ) diff --git a/tests/integration/test_snap_revision_mismatch.py b/tests/integration/test_snap_revision_mismatch.py index e68fa3712d..303401ffbe 100644 --- a/tests/integration/test_snap_revision_mismatch.py +++ b/tests/integration/test_snap_revision_mismatch.py @@ -17,10 +17,9 @@ logger = logging.getLogger(__name__) -@pytest.mark.abort_on_fail async def test_snap_revision_mismatch(ops_test: OpsTest, charm) -> None: """Test snap revision mismatch.""" - (await ops_test.model.deploy(charm, config={"profile": "testing"}),) + await ops_test.model.deploy(charm, config={"profile": "testing"}) async with ops_test.fast_forward(): logger.info("Waiting for the database to become active") await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active") diff --git a/tests/spread/test_failed_replica_bootstrap/task.yaml b/tests/spread/test_failed_replica_bootstrap/task.yaml new file mode 100644 index 0000000000..db986d8bf6 --- /dev/null +++ b/tests/spread/test_failed_replica_bootstrap/task.yaml @@ -0,0 +1,7 @@ +summary: test_failed_replica_bootstrap.py +environment: + TEST_MODULE: test_failed_replica_bootstrap.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From 7bab7e3507263ff47c00bbd5759dbc037df6e083 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 16:53:30 -0300 Subject: [PATCH 10/11] Rename method and fix unit tests Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 6 +++--- tests/unit/test_charm.py | 44 +++++++++++++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/charm.py b/src/charm.py index 1bae9e9737..2b75984de9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1517,7 +1517,7 @@ def _on_update_status(self, _) -> None: if not self._can_run_on_update_status(): return - if not self._ensure_right_workloads_revisions(): + if not self._ensure_correct_snap_revision(): self.unit.status = BlockedStatus(SNAP_REVISION_MISMATCH_MESSAGE) return @@ -1546,8 +1546,8 @@ def _on_update_status(self, _) -> None: # Restart topology observer if it is gone self._observer.start_observer() - def _ensure_right_workloads_revisions(self) -> bool: - """Ensure the workloads revisions are the expected ones.""" + def _ensure_correct_snap_revision(self) -> bool: + """Ensure the snap revision is the expected one.""" right_workloads_revisions = True for snap_name, snap_version in SNAP_PACKAGES: try: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index e500a8d099..23d5553477 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -931,14 +931,15 @@ def test_on_update_status(harness): ) as _primary_endpoint, patch("charm.PostgreSQLProvider.oversee_users") as _oversee_users, patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), - patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs, - patch("charm.Patroni.patroni_logs") as _patroni_logs, patch("charm.Patroni.get_member_status") as _get_member_status, patch( "charm.PostgreSQLBackups.can_use_s3_repository", return_value=(True, None) ) as _can_use_s3_repository, patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, - patch("charm.PostgresqlOperatorCharm.log_pitr_last_transaction_time"), + patch("charm.PostgresqlOperatorCharm._was_restore_successful") as _was_restore_successful, + patch( + "charm.PostgresqlOperatorCharm._ensure_correct_snap_revision" + ) as _ensure_correct_snap_revision, ): rel_id = harness.model.get_relation(PEER).id # Test before the cluster is initialised. @@ -955,7 +956,14 @@ def test_on_update_status(harness): harness.charm.on.update_status.emit() _set_primary_status_message.assert_not_called() + # Test when the snap revision is not correct. + harness.charm.unit.status = ActiveStatus() + _ensure_correct_snap_revision.return_value = False + harness.charm.on.update_status.emit() + _set_primary_status_message.assert_not_called() + # Test the point-in-time-recovery fail. + _ensure_correct_snap_revision.return_value = True with harness.hooks_disabled(): harness.update_relation_data( rel_id, @@ -966,13 +974,11 @@ def test_on_update_status(harness): "restore-to-time": "valid", }, ) - harness.charm.unit.status = ActiveStatus() - _patroni_logs.return_value = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster" + _was_restore_successful.return_value = False harness.charm.on.update_status.emit() _set_primary_status_message.assert_not_called() - assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR - # Test with the unit in a status different that blocked. + # Test when everything is ok. with harness.hooks_disabled(): harness.update_relation_data( rel_id, @@ -1011,6 +1017,29 @@ def test_on_update_status(harness): _start_observer.assert_called_once() +def test_was_restore_successful(harness): + with ( + patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs, + patch("charm.Patroni.patroni_logs") as _patroni_logs, + patch("charm.PostgresqlOperatorCharm.log_pitr_last_transaction_time"), + ): + rel_id = harness.model.get_relation(PEER).id + with harness.hooks_disabled(): + harness.update_relation_data( + rel_id, + harness.charm.app.name, + { + "cluster_initialised": "True", + "restoring-backup": "valid", + "restore-to-time": "valid", + }, + ) + harness.charm.unit.status = ActiveStatus() + _patroni_logs.return_value = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster" + assert not harness.charm._was_restore_successful() + assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR + + def test_on_update_status_after_restore_operation(harness): with ( patch("charm.ClusterTopologyObserver.start_observer"), @@ -1039,6 +1068,7 @@ def test_on_update_status_after_restore_operation(harness): patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, patch("charm.Patroni.get_member_status") as _get_member_status, patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), + patch("charm.PostgresqlOperatorCharm._ensure_correct_snap_revision", return_value=True), ): _get_current_timeline.return_value = "2" rel_id = harness.model.get_relation(PEER).id From 711ee0076be923eaff378ab1cb333d322906068a Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 31 Mar 2025 17:26:35 -0300 Subject: [PATCH 11/11] Add unit test for the _handle_processes_failures method Signed-off-by: Marcelo Henrique Neppel --- tests/unit/test_charm.py | 56 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 23d5553477..64cb62cb87 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1040,6 +1040,62 @@ def test_was_restore_successful(harness): assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR +def test_handle_processes_failures(harness): + with ( + patch("charm.Patroni.restart_patroni") as _restart_patroni, + patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql, + patch("charm.Patroni.cluster_members", new_callable=PropertyMock) as _cluster_members, + patch("charm.Patroni.member_inactive", new_callable=PropertyMock) as _member_inactive, + ): + # Test when the member IP is not listed in relation data, or it's active. + rel_id = harness.model.get_relation(PEER).id + member_ip = str(harness.model.get_binding(PEER).network.bind_address) + harness.update_relation_data(rel_id, harness.charm.app.name, {"members_ips": "[]"}) + _member_inactive.return_value = True + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_not_called() + + harness.update_relation_data( + rel_id, harness.charm.app.name, {"members_ips": f'["{member_ip}"]'} + ) + _member_inactive.return_value = False + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_not_called() + + # Test when the member is part of the cluster. + harness.update_relation_data( + rel_id, harness.charm.app.name, {"members_ips": f'["{member_ip}"]'} + ) + _member_inactive.return_value = True + _cluster_members.return_value = [harness.charm.unit.name.replace("/", "-")] + assert harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_called_once() + _restart_patroni.assert_not_called() + + # Test when the reinitialisation of the member fails. + _reinitialize_postgresql.reset_mock() + _reinitialize_postgresql.side_effect = RetryError(last_attempt=1) + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_called_once() + _restart_patroni.assert_not_called() + + # Test when the member is not part of the cluster. + _reinitialize_postgresql.reset_mock() + _cluster_members.return_value = [] + assert harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_called_once() + + # Test when the restart of the member fails. + _restart_patroni.reset_mock() + _restart_patroni.side_effect = RetryError(last_attempt=1) + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_called_once() + + def test_on_update_status_after_restore_operation(harness): with ( patch("charm.ClusterTopologyObserver.start_observer"),