[DPE-5931] Fix unit stuck in waiting state due to incomplete refresh #809

Draft · wants to merge 13 commits into base: main
89 changes: 81 additions & 8 deletions src/charm.py
@@ -114,6 +114,7 @@
PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit"
EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs"
EXTENSION_OBJECT_MESSAGE = "Cannot disable plugins: Existing objects depend on it. See logs"
SNAP_REVISION_MISMATCH_MESSAGE = "Snap revision mismatch"

Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
PASSWORD_USERS = [*SYSTEM_USERS, "patroni"]
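
The snap-revision check added below iterates SNAP_PACKAGES, so it assumes each entry pairs a snap name with the expected revision per architecture. A hypothetical sketch of that shape (the real definition lives in constants.py, outside this diff; the revision values are placeholders):

# Hypothetical shape of SNAP_PACKAGES (defined in constants.py, not in this diff).
# Each entry pairs a snap name with metadata whose "revision" dict is keyed by
# platform.machine() values.
SNAP_PACKAGES = [
    (
        "charmed-postgresql",
        {"revision": {"x86_64": "EXPECTED-AMD64-REV", "aarch64": "EXPECTED-ARM64-REV"}},
    ),
]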
@@ -1516,6 +1517,10 @@ def _on_update_status(self, _) -> None:
if not self._can_run_on_update_status():
return

if not self._ensure_correct_snap_revision():
self.unit.status = BlockedStatus(SNAP_REVISION_MISMATCH_MESSAGE)
return

if (
self.is_cluster_restoring_backup or self.is_cluster_restoring_to_time
) and not self._was_restore_successful():
@@ -1541,6 +1546,56 @@ def _on_update_status(self, _) -> None:
# Restart topology observer if it is gone
self._observer.start_observer()

def _ensure_correct_snap_revision(self) -> bool:
"""Ensure the snap revision is the expected one."""
right_workloads_revisions = True
for snap_name, snap_version in SNAP_PACKAGES:
try:
snap_cache = snap.SnapCache()
snap_package = snap_cache[snap_name]
if not snap_package.present:
right_workloads_revisions = False
else:
if revision := snap_version.get("revision"):
try:
revision = revision[platform.machine()]
except KeyError:
logger.error("Unavailable snap architecture %s", platform.machine())
right_workloads_revisions = False
continue
if snap_package.revision != revision:
logger.error(
"Snap revision mismatch for %s: expected %s but found %s",
snap_name,
revision,
snap_package.revision,
)
logger.error(f"""
Check the last unit shown in the output of the following command; it is the primary unit:

sudo -H -u snap_daemon charmed-postgresql.patronictl -c /var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml history {self.app.name}

Then, run the following commands to fix the issue (first on the primary, then on each replica):

# List which services are active.
sudo snap services charmed-postgresql

# Refresh the snap to the right revision.
sudo snap refresh charmed-postgresql --revision EXPECTED-REVISION-NUMBER

# Start Patroni and any other previously active services again.
sudo snap start charmed-postgresql.patroni
""")
right_workloads_revisions = False
except (snap.SnapError, snap.SnapNotFoundError) as e:
logger.error(
"An exception occurred when checking %s snap revision. Reason: %s",
snap_name,
str(e),
)
right_workloads_revisions = False
return right_workloads_revisions
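
A minimal pytest sketch of how this check could be exercised in isolation by stubbing the snap cache; the harness fixture and the charm.snap patch target are assumptions, not part of this diff:

from unittest.mock import MagicMock, patch

def test_ensure_correct_snap_revision_mismatch(harness):
    # Hypothetical: the snap is installed, but at a revision that cannot
    # match the expected one from SNAP_PACKAGES.
    snap_package = MagicMock()
    snap_package.present = True
    snap_package.revision = "1"
    snap_cache = MagicMock()
    snap_cache.__getitem__.return_value = snap_package
    with patch("charm.snap.SnapCache", return_value=snap_cache):
        assert not harness.charm._ensure_correct_snap_revision()

On a mismatch the method returns False, which _on_update_status turns into the "Snap revision mismatch" blocked status.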

def _was_restore_successful(self) -> bool:
if self.is_cluster_restoring_to_time and all(self.is_pitr_failed()):
logger.error(
@@ -1610,7 +1665,11 @@ def _can_run_on_update_status(self) -> bool:
logger.debug("Early exit on_update_status: upgrade in progress")
return False

if self.is_blocked and self.unit.status not in S3_BLOCK_MESSAGES:
if (
self.is_blocked
and self.unit.status.message not in S3_BLOCK_MESSAGES
and self.unit.status.message != SNAP_REVISION_MISMATCH_MESSAGE
):
# If charm was failing to disable plugin, try again (user may have removed the objects)
if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
self.enable_disable_extensions()
@@ -1628,13 +1687,27 @@ def _handle_processes_failures(self) -> bool:
# Restart the PostgreSQL process if it was frozen (in that case, the Patroni
# process is running but the PostgreSQL one is not).
if self._unit_ip in self.members_ips and self._patroni.member_inactive:
try:
self._patroni.restart_patroni()
logger.info("restarted PostgreSQL because it was not running")
return True
except RetryError:
logger.error("failed to restart PostgreSQL after checking that it was not running")
return False
if self._member_name in self._patroni.cluster_members:
try:
logger.warning("Inactive member detected. Reinitialising unit.")
self.unit.status = MaintenanceStatus("reinitialising replica")
self._patroni.reinitialize_postgresql()
return True
except RetryError:
logger.error(
"failed to reinitialise PostgreSQL after checking that replica bootstrap failed"
)
return False
else:
try:
self._patroni.restart_patroni()
logger.info("restarted PostgreSQL because it was not running")
return True
except RetryError:
logger.error(
"failed to restart PostgreSQL after checking that it was not running"
)
return False

return False
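
For context, self._patroni.reinitialize_postgresql() presumably wraps Patroni's POST /reinitialize REST endpoint, which discards the local data directory and re-bootstraps the member from the current primary. A rough sketch of that call, assuming the requests library and Patroni's default port 8008 (the real helper lives in the charm's Patroni wrapper, outside this diff):

import requests

def reinitialize_postgresql(unit_ip: str) -> None:
    # Ask Patroni to wipe this member's data directory and re-clone it
    # from the current primary.
    response = requests.post(
        f"http://{unit_ip}:8008/reinitialize",
        json={"force": True},
        timeout=10,
    )
    response.raise_for_status()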

53 changes: 53 additions & 0 deletions tests/integration/test_failed_replica_bootstrap.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# Copyright 2025 Canonical Ltd.
# See LICENSE file for licensing details.
import logging

import pytest
from pytest_operator.plugin import OpsTest

from .helpers import (
DATABASE_APP_NAME,
get_primary,
run_command_on_unit,
)

logger = logging.getLogger(__name__)


async def test_failed_replica_bootstrap(ops_test: OpsTest, charm) -> None:
"""Test failed replica bootstrap."""
await ops_test.model.deploy(charm, config={"profile": "testing"}, num_units=3)
async with ops_test.fast_forward():
logger.info("Waiting for the database to become active")
await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")

any_unit = ops_test.model.applications[DATABASE_APP_NAME].units[0].name
primary = await get_primary(ops_test, any_unit)
replica = next(
unit.name
for unit in ops_test.model.applications[DATABASE_APP_NAME].units
if unit.name != primary
)

logger.info(f"Removing the pg_control file from {replica} to make it fail")
await run_command_on_unit(
ops_test,
replica,
"sudo rm /var/snap/charmed-postgresql/common/var/lib/postgresql/global/pg_control",
)
await run_command_on_unit(
ops_test, replica, "sudo snap restart charmed-postgresql.patroni"
)

logger.info("Waiting for the database to become in maintenance")
application = ops_test.model.applications[DATABASE_APP_NAME]
await ops_test.model.block_until(
lambda: "maintenance"
in {unit.workload_status for unit in application.units if unit.name == replica}
)

logger.info("Waiting for the database to become active again")
await ops_test.model.block_until(
lambda: "active" in {unit.workload_status for unit in application.units}
)
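
For reference, a hypothetical sketch of the run_command_on_unit helper this test imports (the real implementation lives in tests/integration/helpers.py and may differ):

from pytest_operator.plugin import OpsTest

async def run_command_on_unit(ops_test: OpsTest, unit_name: str, command: str) -> str:
    # Run a shell command on the given unit via `juju exec` and return stdout.
    return_code, stdout, stderr = await ops_test.juju(
        "exec", "--unit", unit_name, "--", command
    )
    if return_code != 0:
        raise Exception(f"Command {command} failed on {unit_name}: {stderr}")
    return stdout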
75 changes: 75 additions & 0 deletions tests/integration/test_snap_revision_mismatch.py
@@ -0,0 +1,75 @@
#!/usr/bin/env python3
# Copyright 2025 Canonical Ltd.
# See LICENSE file for licensing details.
import logging

import pytest
from pytest_operator.plugin import OpsTest

from constants import POSTGRESQL_SNAP_NAME, SNAP_PACKAGES

from . import architecture
from .helpers import (
DATABASE_APP_NAME,
run_command_on_unit,
)

logger = logging.getLogger(__name__)


async def test_snap_revision_mismatch(ops_test: OpsTest, charm) -> None:
"""Test snap revision mismatch."""
await ops_test.model.deploy(charm, config={"profile": "testing"})
async with ops_test.fast_forward():
logger.info("Waiting for the database to become active")
await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active")

unit_name = f"{DATABASE_APP_NAME}/0"
old_snap_revision = "142" if architecture.architecture == "arm64" else "143"
arch = architecture.architecture.replace("arm64", "aarch64").replace("amd64", "x86_64")
expected_snap_revision = next(
    snap[1]["revision"][arch]
    for snap in SNAP_PACKAGES
    if snap[0] == POSTGRESQL_SNAP_NAME
)

logger.info(
f"Downgrading {unit_name} snap revision from {expected_snap_revision} to {old_snap_revision}"
)
await run_command_on_unit(
ops_test,
unit_name,
f"sudo snap refresh charmed-postgresql --revision {old_snap_revision}",
)
await run_command_on_unit(
ops_test, unit_name, "sudo snap start charmed-postgresql.patroni"
)

logger.info("Waiting for the database to become blocked")
application = ops_test.model.applications[DATABASE_APP_NAME]
await ops_test.model.block_until(
lambda: "blocked" in {unit.workload_status for unit in application.units}
)
assert application.units[0].workload_status_message == "Snap revision mismatch"

logger.info(
f"Upgrading the snap revision back to the expected one ({expected_snap_revision})"
)
await run_command_on_unit(
ops_test,
unit_name,
f"sudo snap refresh charmed-postgresql --revision {expected_snap_revision}",
)
await run_command_on_unit(
ops_test, unit_name, "sudo snap start charmed-postgresql.patroni"
)

logger.info("Waiting for the database to become active again")
await ops_test.model.block_until(
lambda: "active" in {unit.workload_status for unit in application.units}
)
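
As a usage note, the same snap API the charm relies on can confirm the installed revision on a unit. A tiny sketch, assuming the operator_libs_linux snap library (v2) the charm already imports:

from charms.operator_libs_linux.v2 import snap

cache = snap.SnapCache()
postgresql_snap = cache["charmed-postgresql"]
print(postgresql_snap.present, postgresql_snap.revision)  # e.g. True, "143"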
7 changes: 7 additions & 0 deletions tests/spread/test_failed_replica_bootstrap/task.yaml
@@ -0,0 +1,7 @@
summary: test_failed_replica_bootstrap.py
environment:
TEST_MODULE: test_failed_replica_bootstrap.py
execute: |
tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
artifacts:
- allure-results
7 changes: 7 additions & 0 deletions tests/spread/test_snap_revision_mismatch/task.yaml
@@ -0,0 +1,7 @@
summary: test_snap_revision_mismatch.py
environment:
TEST_MODULE: test_snap_revision_mismatch.py
execute: |
tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results"
artifacts:
- allure-results