diff --git a/src/charm.py b/src/charm.py
index ad4f6c2d1ab..2b75984de95 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -114,6 +114,7 @@
 PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit"
 EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs"
 EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs"
+SNAP_REVISION_MISMATCH_MESSAGE = "Snap revision mismatch"
 
 Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
 PASSWORD_USERS = [*SYSTEM_USERS, "patroni"]
@@ -1516,6 +1517,10 @@ def _on_update_status(self, _) -> None:
         if not self._can_run_on_update_status():
             return
 
+        if not self._ensure_correct_snap_revision():
+            self.unit.status = BlockedStatus(SNAP_REVISION_MISMATCH_MESSAGE)
+            return
+
         if (
             self.is_cluster_restoring_backup or self.is_cluster_restoring_to_time
         ) and not self._was_restore_successful():
@@ -1541,6 +1546,56 @@ def _on_update_status(self, _) -> None:
         # Restart topology observer if it is gone
         self._observer.start_observer()
 
+    def _ensure_correct_snap_revision(self) -> bool:
+        """Ensure the snap revision is the expected one."""
+        right_workloads_revisions = True
+        for snap_name, snap_version in SNAP_PACKAGES:
+            try:
+                snap_cache = snap.SnapCache()
+                snap_package = snap_cache[snap_name]
+                if not snap_package.present:
+                    right_workloads_revisions = False
+                else:
+                    if revision := snap_version.get("revision"):
+                        try:
+                            revision = revision[platform.machine()]
+                        except KeyError:
+                            logger.error("Unavailable snap architecture %s", platform.machine())
+                            right_workloads_revisions = False
+                            continue
+                        if snap_package.revision != revision:
+                            logger.error(
+                                "Snap revision mismatch for %s: expected %s but found %s",
+                                snap_name,
+                                revision,
+                                snap_package.revision,
+                            )
+                            logger.error(f"""
+Check the latest unit shown in the following command, which will be the primary unit:
+
+sudo -H -u snap_daemon charmed-postgresql.patronictl -c /var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml history {self.app.name}
+
+Then, run the following commands to fix the issue (first in the primary, then in each replica):
+
+# List which services are active.
+sudo snap services charmed-postgresql
+
+# Refresh the snap to the right revision.
+sudo snap refresh charmed-postgresql --revision EXPECTED-REVISION-NUMBER
+
+# Start the patroni and the other previously active services again.
+sudo snap start charmed-postgresql.patroni
+                            """)
+                            right_workloads_revisions = False
+            except (snap.SnapError, snap.SnapNotFoundError) as e:
+                logger.error(
+                    "An exception occurred when checking %s snap revision. Reason: %s",
+                    snap_name,
+                    str(e),
+                )
+                right_workloads_revisions = False
+        return right_workloads_revisions
+
     def _was_restore_successful(self) -> bool:
         if self.is_cluster_restoring_to_time and all(self.is_pitr_failed()):
             logger.error(
@@ -1610,7 +1665,11 @@ def _can_run_on_update_status(self) -> bool:
             logger.debug("Early exit on_update_status: upgrade in progress")
             return False
 
-        if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES:
+        if (
+            self.is_blocked
+            and self.unit.status.message not in S3_BLOCK_MESSAGES
+            and self.unit.status.message != SNAP_REVISION_MISMATCH_MESSAGE
+        ):
             # If charm was failing to disable plugin, try again (user may have removed the objects)
             if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
                 self.enable_disable_extensions()
@@ -1628,13 +1687,27 @@ def _handle_processes_failures(self) -> bool:
         # Restart the PostgreSQL process if it was frozen (in that case, the Patroni
         # process is running by the PostgreSQL process not).
         if self._unit_ip in self.members_ips and self._patroni.member_inactive:
-            try:
-                self._patroni.restart_patroni()
-                logger.info("restarted PostgreSQL because it was not running")
-                return True
-            except RetryError:
-                logger.error("failed to restart PostgreSQL after checking that it was not running")
-                return False
+            if self._member_name in self._patroni.cluster_members:
+                try:
+                    logger.warning("Inactive member detected. Reinitialising unit.")
+                    self.unit.status = MaintenanceStatus("reinitialising replica")
+                    self._patroni.reinitialize_postgresql()
+                    return True
+                except RetryError:
+                    logger.error(
+                        "failed to reinitialise PostgreSQL after checking that replica bootstrap failed"
+                    )
+                    return False
+            else:
+                try:
+                    self._patroni.restart_patroni()
+                    logger.info("restarted PostgreSQL because it was not running")
+                    return True
+                except RetryError:
+                    logger.error(
+                        "failed to restart PostgreSQL after checking that it was not running"
+                    )
+                    return False
 
         return False
 
diff --git a/tests/integration/test_failed_replica_bootstrap.py b/tests/integration/test_failed_replica_bootstrap.py
new file mode 100644
index 00000000000..5f3485334b7
--- /dev/null
+++ b/tests/integration/test_failed_replica_bootstrap.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# Copyright 2025 Canonical Ltd.
+# See LICENSE file for licensing details.
+import logging
+
+import pytest
+from pytest_operator.plugin import OpsTest
+
+from .helpers import (
+    DATABASE_APP_NAME,
+    get_primary,
+    run_command_on_unit,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def test_failed_replica_bootstrap(ops_test: OpsTest, charm) -> None:
+    """Test failed replica bootstrap."""
+    await ops_test.model.deploy(charm, config={"profile": "testing"}, num_units=3)
+    async with ops_test.fast_forward():
+        logger.info("Waiting for the database to become active")
+        await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")
+
+    any_unit = ops_test.model.applications[DATABASE_APP_NAME].units[0].name
+    primary = await get_primary(ops_test, any_unit)
+    replica = next(
+        unit.name
+        for unit in ops_test.model.applications[DATABASE_APP_NAME].units
+        if unit.name != primary
+    )
+
+    logger.info(f"Removing the pg_control file from {replica} to make it fail")
+    await run_command_on_unit(
+        ops_test,
+        replica,
+        "sudo rm /var/snap/charmed-postgresql/common/var/lib/postgresql/global/pg_control",
+    )
+    await run_command_on_unit(
+        ops_test, replica, "sudo snap restart charmed-postgresql.patroni"
+    )
+
+    logger.info("Waiting for the replica to enter maintenance")
+    application = ops_test.model.applications[DATABASE_APP_NAME]
+    await ops_test.model.block_until(
+        lambda: "maintenance"
+        in {unit.workload_status for unit in application.units if unit.name == replica}
+    )
+
+    logger.info("Waiting for the database to become active again")
+    await ops_test.model.block_until(
+        lambda: "active" in {unit.workload_status for unit in application.units}
+    )
diff --git a/tests/integration/test_snap_revision_mismatch.py b/tests/integration/test_snap_revision_mismatch.py
new file mode 100644
index 00000000000..303401ffbe0
--- /dev/null
+++ b/tests/integration/test_snap_revision_mismatch.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# Copyright 2025 Canonical Ltd.
+# See LICENSE file for licensing details.
+import logging
+
+import pytest
+from pytest_operator.plugin import OpsTest
+
+from constants import POSTGRESQL_SNAP_NAME, SNAP_PACKAGES
+
+from . import architecture
+from .helpers import (
+    DATABASE_APP_NAME,
+    run_command_on_unit,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def test_snap_revision_mismatch(ops_test: OpsTest, charm) -> None:
+    """Test snap revision mismatch."""
+    await ops_test.model.deploy(charm, config={"profile": "testing"})
+    async with ops_test.fast_forward():
+        logger.info("Waiting for the database to become active")
+        await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")
+
+    unit_name = f"{DATABASE_APP_NAME}/0"
+    old_snap_revision = "142" if architecture.architecture == "arm64" else "143"
+    expected_snap_revision = next(
+        iter([
+            snap[1]["revision"][
+                architecture.architecture.replace("arm64", "aarch64").replace(
+                    "amd64", "x86_64"
+                )
+            ]
+            for snap in SNAP_PACKAGES
+            if snap[0] == POSTGRESQL_SNAP_NAME
+        ])
+    )
+
+    logger.info(
+        f"Downgrading {unit_name} snap revision from {expected_snap_revision} to {old_snap_revision}"
+    )
+    await run_command_on_unit(
+        ops_test,
+        unit_name,
+        f"sudo snap refresh charmed-postgresql --revision {old_snap_revision}",
+    )
+    await run_command_on_unit(
+        ops_test, unit_name, "sudo snap start charmed-postgresql.patroni"
+    )
+
+    logger.info("Waiting for the database to become blocked")
+    application = ops_test.model.applications[DATABASE_APP_NAME]
+    await ops_test.model.block_until(
+        lambda: "blocked" in {unit.workload_status for unit in application.units}
+    )
+    assert application.units[0].workload_status_message == "Snap revision mismatch"
+
+    logger.info(
+        f"Upgrading the snap revision back to the expected one ({expected_snap_revision})"
+    )
+    await run_command_on_unit(
+        ops_test,
+        unit_name,
+        f"sudo snap refresh charmed-postgresql --revision {expected_snap_revision}",
+    )
+    await run_command_on_unit(
+        ops_test, unit_name, "sudo snap start charmed-postgresql.patroni"
+    )
+
+    logger.info("Waiting for the database to become active again")
+    await ops_test.model.block_until(
+        lambda: "active" in {unit.workload_status for unit in application.units}
+    )
diff --git a/tests/spread/test_failed_replica_bootstrap/task.yaml b/tests/spread/test_failed_replica_bootstrap/task.yaml
new file mode 100644
index 00000000000..db986d8bf64
--- /dev/null
+++ b/tests/spread/test_failed_replica_bootstrap/task.yaml
@@ -0,0 +1,7 @@
+summary: test_failed_replica_bootstrap.py
+environment:
+  TEST_MODULE: test_failed_replica_bootstrap.py
+execute: |
+  tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
+artifacts:
+  - allure-results
diff --git a/tests/spread/test_snap_revision_mismatch/task.yaml b/tests/spread/test_snap_revision_mismatch/task.yaml
new file mode 100644
index 00000000000..10c0db7aac1
--- /dev/null
+++ b/tests/spread/test_snap_revision_mismatch/task.yaml
@@ -0,0 +1,7 @@
+summary: test_snap_revision_mismatch.py
+environment:
+  TEST_MODULE: test_snap_revision_mismatch.py
+execute: |
+  tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
+artifacts:
+  - allure-results
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index e500a8d099c..64cb62cb87d 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -931,14 +931,15 @@ def test_on_update_status(harness):
         ) as _primary_endpoint,
         patch("charm.PostgreSQLProvider.oversee_users") as _oversee_users,
         patch("upgrade.PostgreSQLUpgrade.idle", return_value=True),
-        patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs,
-        patch("charm.Patroni.patroni_logs") as _patroni_logs,
         patch("charm.Patroni.get_member_status") as _get_member_status,
         patch(
             "charm.PostgreSQLBackups.can_use_s3_repository", return_value=(True, None)
         ) as _can_use_s3_repository,
         patch("charm.PostgresqlOperatorCharm.update_config") as _update_config,
-        patch("charm.PostgresqlOperatorCharm.log_pitr_last_transaction_time"),
+        patch("charm.PostgresqlOperatorCharm._was_restore_successful") as _was_restore_successful,
+        patch(
+            "charm.PostgresqlOperatorCharm._ensure_correct_snap_revision"
+        ) as _ensure_correct_snap_revision,
     ):
         rel_id = harness.model.get_relation(PEER).id
         # Test before the cluster is initialised.
@@ -955,7 +956,14 @@ def test_on_update_status(harness):
         harness.charm.on.update_status.emit()
         _set_primary_status_message.assert_not_called()
 
+        # Test when the snap revision is not correct.
+        harness.charm.unit.status = ActiveStatus()
+        _ensure_correct_snap_revision.return_value = False
+        harness.charm.on.update_status.emit()
+        _set_primary_status_message.assert_not_called()
+
         # Test the point-in-time-recovery fail.
+        _ensure_correct_snap_revision.return_value = True
         with harness.hooks_disabled():
             harness.update_relation_data(
                 rel_id,
@@ -966,13 +974,11 @@ def test_on_update_status(harness):
                     "restore-to-time": "valid",
                 },
             )
-        harness.charm.unit.status = ActiveStatus()
-        _patroni_logs.return_value = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster"
+        _was_restore_successful.return_value = False
         harness.charm.on.update_status.emit()
         _set_primary_status_message.assert_not_called()
-        assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR
 
-        # Test with the unit in a status different that blocked.
+        # Test when everything is ok.
         with harness.hooks_disabled():
             harness.update_relation_data(
                 rel_id,
@@ -1011,6 +1017,85 @@ def test_on_update_status(harness):
         _start_observer.assert_called_once()
 
 
+def test_was_restore_successful(harness):
+    with (
+        patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs,
+        patch("charm.Patroni.patroni_logs") as _patroni_logs,
+        patch("charm.PostgresqlOperatorCharm.log_pitr_last_transaction_time"),
+    ):
+        rel_id = harness.model.get_relation(PEER).id
+        with harness.hooks_disabled():
+            harness.update_relation_data(
+                rel_id,
+                harness.charm.app.name,
+                {
+                    "cluster_initialised": "True",
+                    "restoring-backup": "valid",
+                    "restore-to-time": "valid",
+                },
+            )
+        harness.charm.unit.status = ActiveStatus()
+        _patroni_logs.return_value = "2022-02-24 02:00:00 UTC patroni.exceptions.PatroniFatalException: Failed to bootstrap cluster"
+        assert not harness.charm._was_restore_successful()
+        assert harness.charm.unit.status.message == CANNOT_RESTORE_PITR
+
+
+def test_handle_processes_failures(harness):
+    with (
+        patch("charm.Patroni.restart_patroni") as _restart_patroni,
+        patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql,
+        patch("charm.Patroni.cluster_members", new_callable=PropertyMock) as _cluster_members,
+        patch("charm.Patroni.member_inactive", new_callable=PropertyMock) as _member_inactive,
+    ):
+        # Test when the member IP is not listed in relation data, or it's active.
+ rel_id = harness.model.get_relation(PEER).id + member_ip = str(harness.model.get_binding(PEER).network.bind_address) + harness.update_relation_data(rel_id, harness.charm.app.name, {"members_ips": "[]"}) + _member_inactive.return_value = True + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_not_called() + + harness.update_relation_data( + rel_id, harness.charm.app.name, {"members_ips": f'["{member_ip}"]'} + ) + _member_inactive.return_value = False + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_not_called() + + # Test when the member is part of the cluster. + harness.update_relation_data( + rel_id, harness.charm.app.name, {"members_ips": f'["{member_ip}"]'} + ) + _member_inactive.return_value = True + _cluster_members.return_value = [harness.charm.unit.name.replace("/", "-")] + assert harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_called_once() + _restart_patroni.assert_not_called() + + # Test when the reinitialisation of the member fails. + _reinitialize_postgresql.reset_mock() + _reinitialize_postgresql.side_effect = RetryError(last_attempt=1) + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_called_once() + _restart_patroni.assert_not_called() + + # Test when the member is not part of the cluster. + _reinitialize_postgresql.reset_mock() + _cluster_members.return_value = [] + assert harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_called_once() + + # Test when the restart of the member fails. + _restart_patroni.reset_mock() + _restart_patroni.side_effect = RetryError(last_attempt=1) + assert not harness.charm._handle_processes_failures() + _reinitialize_postgresql.assert_not_called() + _restart_patroni.assert_called_once() + + def test_on_update_status_after_restore_operation(harness): with ( patch("charm.ClusterTopologyObserver.start_observer"), @@ -1039,6 +1124,7 @@ def test_on_update_status_after_restore_operation(harness): patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, patch("charm.Patroni.get_member_status") as _get_member_status, patch("upgrade.PostgreSQLUpgrade.idle", return_value=True), + patch("charm.PostgresqlOperatorCharm._ensure_correct_snap_revision", return_value=True), ): _get_current_timeline.return_value = "2" rel_id = harness.model.get_relation(PEER).id