diff --git a/.github/renovate.json5 b/.github/renovate.json5 index 34085c9225..cd60ef68a5 100644 --- a/.github/renovate.json5 +++ b/.github/renovate.json5 @@ -6,6 +6,7 @@ reviewers: [ 'team:data-platform-postgresql', ], + "baseBranches": ["main", "/^*\\/edge$/"], packageRules: [ { matchPackageNames: [ @@ -13,32 +14,7 @@ ], allowedVersions: '<2.0.0', }, - { - matchManagers: [ - 'custom.regex', - ], - matchDepNames: [ - 'juju', - ], - matchDatasources: [ - 'pypi', - ], - allowedVersions: '<3', - groupName: 'Juju agents', - }, ], customManagers: [ - { - customType: 'regex', - fileMatch: [ - '^\\.github/workflows/[^/]+\\.ya?ml$', - ], - matchStrings: [ - '(libjuju: )==(?.*?) +# renovate: latest libjuju 2', - ], - depNameTemplate: 'juju', - datasourceTemplate: 'pypi', - versioningTemplate: 'loose', - }, ], } diff --git a/.github/workflows/check_pr.yaml b/.github/workflows/check_pr.yaml index beaa1541a3..ce3d160399 100644 --- a/.github/workflows/check_pr.yaml +++ b/.github/workflows/check_pr.yaml @@ -11,6 +11,7 @@ on: - edited branches: - main + - '*/edge' jobs: check-pr: diff --git a/src/charm.py b/src/charm.py index 30ca8117a9..ad4f6c2d1a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -385,30 +385,22 @@ def primary_endpoint(self) -> str | None: logger.debug("primary endpoint early exit: Peer relation not joined yet.") return None try: - for attempt in Retrying(stop=stop_after_delay(5), wait=wait_fixed(3)): - with attempt: - primary = self._patroni.get_primary() - if primary is None and (standby_leader := self._patroni.get_standby_leader()): - primary = standby_leader - primary_endpoint = self._patroni.get_member_ip(primary) - # Force a retry if there is no primary or the member that was - # returned is not in the list of the current cluster members - # (like when the cluster was not updated yet after a failed switchover). - if not primary_endpoint or primary_endpoint not in self._units_ips: - # TODO figure out why peer data is not available - if ( - primary_endpoint - and len(self._units_ips) == 1 - and len(self._peers.units) > 1 - ): - logger.warning( - "Possibly incoplete peer data: Will not map primary IP to unit IP" - ) - return primary_endpoint - logger.debug( - "primary endpoint early exit: Primary IP not in cached peer list." - ) - primary_endpoint = None + primary = self._patroni.get_primary() + if primary is None and (standby_leader := self._patroni.get_standby_leader()): + primary = standby_leader + primary_endpoint = self._patroni.get_member_ip(primary) + # Force a retry if there is no primary or the member that was + # returned is not in the list of the current cluster members + # (like when the cluster was not updated yet after a failed switchover). + if not primary_endpoint or primary_endpoint not in self._units_ips: + # TODO figure out why peer data is not available + if primary_endpoint and len(self._units_ips) == 1 and len(self._peers.units) > 1: + logger.warning( + "Possibly incoplete peer data: Will not map primary IP to unit IP" + ) + return primary_endpoint + logger.debug("primary endpoint early exit: Primary IP not in cached peer list.") + primary_endpoint = None except RetryError: return None else: @@ -952,6 +944,8 @@ def _units_ips(self) -> set[str]: # Get all members IPs and remove the current unit IP from the list. addresses = {self._get_unit_ip(unit) for unit in self._peers.units} addresses.add(self._unit_ip) + if None in addresses: + addresses.remove(None) return addresses @property diff --git a/src/cluster.py b/src/cluster.py index b321a4cac4..b5af840169 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -21,7 +21,6 @@ from ops import BlockedStatus from pysyncobj.utility import TcpUtility, UtilityException from tenacity import ( - AttemptManager, RetryError, Retrying, retry, @@ -233,11 +232,12 @@ def get_member_ip(self, member_name: str) -> str: IP address of the cluster member. """ # Request info from cluster endpoint (which returns all members of the cluster). - for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)): + attempts = 2 * len(self.peers_ips) + 2 + urls = self._get_alternative_patroni_url() + for attempt in Retrying(stop=stop_after_attempt(attempts)): with attempt: - url = self._get_alternative_patroni_url(attempt) cluster_status = requests.get( - f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", + f"{next(urls)}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", verify=self.verify, timeout=API_REQUEST_TIMEOUT, auth=self._patroni_auth, @@ -257,11 +257,12 @@ def get_member_status(self, member_name: str) -> str: couldn't be retrieved yet. """ # Request info from cluster endpoint (which returns all members of the cluster). - for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)): + attempts = 2 * len(self.peers_ips) + 2 + urls = self._get_alternative_patroni_url() + for attempt in Retrying(stop=stop_after_attempt(attempts)): with attempt: - url = self._get_alternative_patroni_url(attempt) cluster_status = requests.get( - f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", + f"{next(urls)}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", verify=self.verify, timeout=API_REQUEST_TIMEOUT, auth=self._patroni_auth, @@ -284,11 +285,15 @@ def get_primary( primary pod or unit name. """ # Request info from cluster endpoint (which returns all members of the cluster). - for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)): + if alternative_endpoints: + attempts = 2 * len(alternative_endpoints) + 2 + else: + attempts = 2 * len(self.peers_ips) + 2 + urls = self._get_alternative_patroni_url(alternative_endpoints) + for attempt in Retrying(stop=stop_after_attempt(attempts)): with attempt: - url = self._get_alternative_patroni_url(attempt, alternative_endpoints) cluster_status = requests.get( - f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", + f"{next(urls)}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", verify=self.verify, timeout=API_REQUEST_TIMEOUT, auth=self._patroni_auth, @@ -314,11 +319,12 @@ def get_standby_leader( standby leader pod or unit name. """ # Request info from cluster endpoint (which returns all members of the cluster). - for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)): + attempts = 2 * len(self.peers_ips) + 2 + urls = self._get_alternative_patroni_url() + for attempt in Retrying(stop=stop_after_attempt(attempts)): with attempt: - url = self._get_alternative_patroni_url(attempt) cluster_status = requests.get( - f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", + f"{next(urls)}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", verify=self.verify, timeout=API_REQUEST_TIMEOUT, auth=self._patroni_auth, @@ -338,11 +344,12 @@ def get_sync_standby_names(self) -> list[str]: """Get the list of sync standby unit names.""" sync_standbys = [] # Request info from cluster endpoint (which returns all members of the cluster). - for attempt in Retrying(stop=stop_after_attempt(2 * len(self.peers_ips) + 1)): + attempts = 2 * len(self.peers_ips) + 2 + urls = self._get_alternative_patroni_url() + for attempt in Retrying(stop=stop_after_attempt(attempts)): with attempt: - url = self._get_alternative_patroni_url(attempt) r = requests.get( - f"{url}/cluster", + f"{next(urls)}/cluster", verify=self.verify, auth=self._patroni_auth, timeout=PATRONI_TIMEOUT, @@ -352,33 +359,26 @@ def get_sync_standby_names(self) -> list[str]: sync_standbys.append("/".join(member["name"].rsplit("-", 1))) return sync_standbys - def _get_alternative_patroni_url( - self, attempt: AttemptManager, alternative_endpoints: list[str] | None = None - ) -> str: + def _get_alternative_patroni_url(self, alternative_endpoints: list[str] | None = None) -> str: """Get an alternative REST API URL from another member each time. When the Patroni process is not running in the current unit it's needed to use a URL from another cluster member REST API to do some operations. """ - if alternative_endpoints is not None: - return self._patroni_url.replace( - self.unit_ip, alternative_endpoints[attempt.retry_state.attempt_number - 1] - ) - attempt_number = attempt.retry_state.attempt_number - if attempt_number > 1: - url = self._patroni_url - # Build the URL using http and later using https for each peer. - if (attempt_number - 1) <= len(self.peers_ips): - url = url.replace("https://", "http://") - unit_number = attempt_number - 2 - else: - url = url.replace("http://", "https://") - unit_number = attempt_number - 2 - len(self.peers_ips) - other_unit_ip = list(self.peers_ips)[unit_number] - url = url.replace(self.unit_ip, other_unit_ip) + if not alternative_endpoints: + alternative_endpoints = list(self.peers_ips) + urls = [self._patroni_url] + urls += [ + self._patroni_url.replace(self.unit_ip, endpoint) for endpoint in alternative_endpoints + ] + if self.tls_enabled: + default_schema = "https" + new_schema = "http" else: - url = self._patroni_url - return url + default_schema = "http" + new_schema = "https" + urls += [url.replace(default_schema, new_schema) for url in urls] + yield from urls def are_all_members_ready(self) -> bool: """Check if all members are correctly running Patroni and PostgreSQL. diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 7de1a502cf..e500a8d099 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -177,8 +177,6 @@ def test_patroni_scrape_config_tls(harness): def test_primary_endpoint(harness): with ( - patch("charm.stop_after_delay", new_callable=PropertyMock) as _stop_after_delay, - patch("charm.wait_fixed", new_callable=PropertyMock) as _wait_fixed, patch( "charm.PostgresqlOperatorCharm._units_ips", new_callable=PropertyMock, @@ -190,10 +188,6 @@ def test_primary_endpoint(harness): _patroni.return_value.get_primary.return_value = sentinel.primary assert harness.charm.primary_endpoint == "1.1.1.1" - # Check needed to ensure a fast charm deployment. - _stop_after_delay.assert_called_once_with(5) - _wait_fixed.assert_called_once_with(3) - _patroni.return_value.get_member_ip.assert_called_once_with(sentinel.primary) _patroni.return_value.get_primary.assert_called_once_with() diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 7dd1e1ddf7..6bcf643f7f 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -11,8 +11,6 @@ from ops.testing import Harness from pysyncobj.utility import UtilityException from tenacity import ( - AttemptManager, - RetryCallState, RetryError, Retrying, stop_after_delay, @@ -95,49 +93,59 @@ def patroni(harness, peers_ips): def test_get_alternative_patroni_url(peers_ips, patroni): - # Mock tenacity attempt. - retry = Retrying() - retry_state = RetryCallState(retry, None, None, None) - attempt = AttemptManager(retry_state) - # Test the first URL that is returned (it should have the current unit IP). - url = patroni._get_alternative_patroni_url(attempt) + url = next(patroni._get_alternative_patroni_url()) assert url == f"http://{patroni.unit_ip}:8008" - # Test returning the other servers URLs. - for attempt_number in range(attempt.retry_state.attempt_number + 1, len(peers_ips) + 2): - attempt.retry_state.attempt_number = attempt_number - url = patroni._get_alternative_patroni_url(attempt) - assert url.split("http://")[1].split(":8008")[0] in peers_ips + other_urls = list(patroni._get_alternative_patroni_url()) + for ip in list(peers_ips): + assert f"http://{ip}:8008" in other_urls + assert f"https://{ip}:8008" in other_urls + + # Test alternatives + assert [ + f"http://{patroni.unit_ip}:8008", + "http://a:8008", + "http://b:8008", + "http://c:8008", + f"https://{patroni.unit_ip}:8008", + "https://a:8008", + "https://b:8008", + "https://c:8008", + ] == list(patroni._get_alternative_patroni_url(["a", "b", "c"])) def test_get_member_ip(peers_ips, patroni): with ( patch("requests.get", side_effect=mocked_requests_get) as _get, - patch("charm.Patroni._get_alternative_patroni_url") as _get_alternative_patroni_url, + patch("charm.Patroni._patroni_url", new_callable=PropertyMock) as _patroni_url, ): # Test error on trying to get the member IP. - _get_alternative_patroni_url.side_effect = "http://server2" + _patroni_url.return_value = "http://server2" + patroni.unit_ip = "server2" + patroni.peers_ips = [] with pytest.raises(RetryError): patroni.get_member_ip(patroni.member_name) assert False # Test using an alternative Patroni URL. - _get_alternative_patroni_url.side_effect = [ - "http://server3", - "http://server2", - "http://server1", + _patroni_url.return_value = "http://server3" + patroni.unit_ip = "server3" + patroni.peers_ips = [ + "server2", + "server1", ] ip = patroni.get_member_ip(patroni.member_name) assert ip == "1.1.1.1" # Test using the current Patroni URL. - _get_alternative_patroni_url.side_effect = ["http://server1"] + _patroni_url.return_value = "http://server1" + patroni.unit_ip = "server1" + patroni.peers_ips = [] ip = patroni.get_member_ip(patroni.member_name) assert ip == "1.1.1.1" # Test when not having that specific member in the cluster. - _get_alternative_patroni_url.side_effect = ["http://server1"] ip = patroni.get_member_ip("other-member-name") assert ip is None @@ -184,30 +192,36 @@ def test_get_postgresql_version(peers_ips, patroni): def test_get_primary(peers_ips, patroni): with ( patch("requests.get", side_effect=mocked_requests_get) as _get, - patch("charm.Patroni._get_alternative_patroni_url") as _get_alternative_patroni_url, + patch("charm.Patroni._patroni_url", new_callable=PropertyMock) as _patroni_url, ): # Test error on trying to get the member IP. - _get_alternative_patroni_url.side_effect = "http://server2" + _patroni_url.return_value = "http://server2" + patroni.unit_ip = "server2" + patroni.peers_ips = [] with pytest.raises(RetryError): patroni.get_primary(patroni.member_name) assert False # Test using an alternative Patroni URL. - _get_alternative_patroni_url.side_effect = [ - "http://server3", - "http://server2", - "http://server1", + _patroni_url.return_value = "http://server3" + patroni.unit_ip = "server3" + patroni.peers_ips = [ + "server2", + "server1", ] primary = patroni.get_primary() assert primary == "postgresql-0" # Test using the current Patroni URL. - _get_alternative_patroni_url.side_effect = ["http://server1"] + _patroni_url.return_value = "http://server1" + patroni.unit_ip = "server1" + patroni.peers_ips = [] primary = patroni.get_primary() assert primary == "postgresql-0" # Test requesting the primary in the unit name pattern. - _get_alternative_patroni_url.side_effect = ["http://server1"] + patroni.unit_ip = "server1" + patroni.peers_ips = [] primary = patroni.get_primary(unit_name_pattern=True) assert primary == "postgresql/0" @@ -235,6 +249,7 @@ def test_is_replication_healthy(peers_ips, patroni): with ( patch("requests.get") as _get, patch("charm.Patroni.get_primary"), + patch("charm.Patroni.get_member_ip"), patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), ): # Test when replication is healthy.