From a9a07aa12309d72d1bcfe4aeeb405d15958ed31d Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Tue, 3 Dec 2024 19:07:17 +0530 Subject: [PATCH 1/5] Add test for zone unaware app Signed-off-by: Mahesh Shetty --- .../disaster-recovery/sc_arbiter/test_zone_unaware_app.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py new file mode 100644 index 00000000000..e69de29bb2d From e9e14bba8962825bab51e8f3b55b99879d89677b Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Fri, 6 Dec 2024 00:29:18 +0530 Subject: [PATCH 2/5] Support zone unaware apps in zone shutdown scenario w and w/o fencing Signed-off-by: Mahesh Shetty --- ocs_ci/helpers/helpers.py | 87 ++++++ ocs_ci/ocs/constants.py | 13 + .../network/network-fence-class.yaml | 10 + ocs_ci/templates/network/network-fence.yaml | 14 + .../workloads/logwriter/cephfs.logreader.yaml | 1 + tests/conftest.py | 91 ++++-- .../sc_arbiter/test_zone_unaware_app.py | 281 ++++++++++++++++++ 7 files changed, 478 insertions(+), 19 deletions(-) create mode 100644 ocs_ci/templates/network/network-fence-class.yaml create mode 100644 ocs_ci/templates/network/network-fence.yaml diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py index f30b1518479..039dfb488cd 100644 --- a/ocs_ci/helpers/helpers.py +++ b/ocs_ci/helpers/helpers.py @@ -38,6 +38,7 @@ query_nb_db_psql_version, ) +from ocs_ci.ocs.node import get_worker_nodes from ocs_ci.ocs import constants, defaults, node, ocp, exceptions from ocs_ci.ocs.exceptions import ( CommandFailed, @@ -5744,3 +5745,89 @@ def verify_reclaimspacecronjob_suspend_state_for_pvc(pvc_obj): logger.info(f"ReclaimSpace operation is enabled for PVC '{pvc_obj.name}'") return False + + +def get_rbd_daemonset_csi_addons_node_object(node): + """ + Gets rdb daemonset CSI addons node data + + Args: + node (str): Name of the node + + Returns: + dict: CSI addons node object info + + """ + namespace = config.ENV_DATA["cluster_namespace"] + csi_addons_node = OCP(kind=constants.CSI_ADDONS_NODE_KIND, namespace=namespace) + csi_addons_node_data = csi_addons_node.get( + resource_name=f"{node}-{namespace}-daemonset-csi-rbdplugin" + ) + return csi_addons_node_data + + +def create_network_fence_class(): + """ + Create NetworkFenceClass CR and verify Ips are populated + in respective CsiAddonsNode objects + + """ + + logger.info("Creating NetworkFenceClass") + network_fence_class_dict = templating.load_yaml(constants.NETWORK_FENCE_CLASS_CRD) + network_fence_class_obj = create_resource(**network_fence_class_dict) + if network_fence_class_obj.ocp.get( + resource_name=network_fence_class_obj.name, dont_raise=True + ): + logger.info( + f"NetworkFenceClass {network_fence_class_obj.name} created successfully" + ) + + logger.info("Verifying CsiAddonsNode object for CSI RBD daemonset") + all_nodes = get_worker_nodes() + + for node_name in all_nodes: + cidrs = get_rbd_daemonset_csi_addons_node_object(node_name)["status"][ + "networkFenceClientStatus" + ][0]["ClientDetails"][0]["cidrs"] + assert len(cidrs) == 1, "No cidrs are populated to CSI Addons node object" + logger.info(f"Cidr: {cidrs[0]} populated in {node_name} CSI addons node object") + + +def create_network_fence(node_name, cidr): + """ + Create NetworkFence for the node + + """ + logger.info("Creating NetworkFence") + network_fence_dict = 
templating.load_yaml(constants.NETWORK_FENCE_CRD) + network_fence_dict["metadata"]["name"] = node_name + network_fence_dict["spec"]["cidrs"][0] = cidr + network_fence_obj = create_resource(**network_fence_dict) + if network_fence_obj.ocp.get(resource_name=network_fence_obj.name, dont_raise=True): + logger.info( + f"NetworkFence {network_fence_obj.name} for node {node_name} created successfully" + ) + + +def unfence_node(node_name): + """ + Un-fence node + + Args: + node_name (str): Name of the node + + """ + + network_fence_obj = OCP( + kind=constants.NETWORK_FENCE, namespace=config.ENV_DATA["cluster_namespace"] + ) + network_fence_obj.patch( + resource_name=node_name, + params='{"spec":{"fenceState":"Unfenced"}}', + format_type="merge", + ) + assert ( + network_fence_obj.get(resource_name=node_name)["spec"]["fenceState"] != "Fenced" + ), f"{node_name} doesnt seem to be unfenced" + logger.info(f"Unfenced node {node_name} successfully!") \ No newline at end of file diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py index 1d3b673bc0c..e6bb7a97f99 100644 --- a/ocs_ci/ocs/constants.py +++ b/ocs_ci/ocs/constants.py @@ -241,6 +241,8 @@ IMAGECONTENTSOURCEPOLICY_KIND = "ImageContentSourcePolicy" NOOBAA_ACCOUNT = "NoobaaAccount" EXTERNAL_CLUSTER_SCRIPT_CONFIG = "rook-ceph-external-cluster-script-config" +NETWORK_FENCE_CLASS = "NetworkFenceClass" +NETWORK_FENCE = "NetworkFence" # Provisioners AWS_EFS_PROVISIONER = "openshift.org/aws-efs" @@ -302,6 +304,9 @@ RAM = "rss" VIRT = "vms" +ODF_NETWORK_FENCE_CLASS = "odf-networkfenceclass" +CSI_ADDONS_NODE_KIND = "CSIAddonsNode" + # cluster types MS_CONSUMER_TYPE = "consumer" MS_PROVIDER_TYPE = "provider" @@ -2425,6 +2430,12 @@ LOGWRITER_CEPHFS_WRITER = os.path.join(LOGWRITER_DIR, "cephfs.logwriter.yaml") LOGWRITER_STS_PATH = os.path.join(LOGWRITER_DIR, "logwriter.rbd.yaml") +# Network Fence CRDs +NETWORK_FENCE_CLASS_CRD = os.path.join( + TEMPLATE_DIR, "network", "network-fence-class.yaml" +) +NETWORK_FENCE_CRD = os.path.join(TEMPLATE_DIR, "network", "network-fence.yaml") + # MCG namespace constants MCG_NS_AWS_ENDPOINT = "https://s3.amazonaws.com" MCG_NS_AZURE_ENDPOINT = "https://blob.core.windows.net" @@ -3011,6 +3022,8 @@ f"{ARBITER_ZONE}{DATA_ZONE_1}-{DATA_ZONE_1}{DATA_ZONE_2}" ) +NODE_OUT_OF_SERVICE_TAINT = "node.kubernetes.io/out-of-service=nodeshutdown:NoExecute" + # Logwriter workload labels LOGWRITER_CEPHFS_LABEL = "app=logwriter-cephfs" diff --git a/ocs_ci/templates/network/network-fence-class.yaml b/ocs_ci/templates/network/network-fence-class.yaml new file mode 100644 index 00000000000..eb0d44d145b --- /dev/null +++ b/ocs_ci/templates/network/network-fence-class.yaml @@ -0,0 +1,10 @@ +apiVersion: csiaddons.openshift.io/v1alpha1 +kind: NetworkFenceClass +metadata: + name: odf-networkfenceclass +spec: + provisioner: openshift-storage.rbd.csi.ceph.com + parameters: + clusterID: openshift-storage + csiaddons.openshift.io/networkfence-secret-name: rook-csi-rbd-node + csiaddons.openshift.io/networkfence-secret-namespace: openshift-storage diff --git a/ocs_ci/templates/network/network-fence.yaml b/ocs_ci/templates/network/network-fence.yaml new file mode 100644 index 00000000000..dfbbcb7e45b --- /dev/null +++ b/ocs_ci/templates/network/network-fence.yaml @@ -0,0 +1,14 @@ +apiVersion: csiaddons.openshift.io/v1alpha1 +kind: NetworkFence +metadata: + name: +spec: + cidrs: + - + driver: openshift-storage.rbd.csi.ceph.com + fenceState: Fenced + parameters: + clusterID: openshift-storage + secret: + name: rook-csi-rbd-provisioner + namespace: 
openshift-storage diff --git a/ocs_ci/templates/workloads/logwriter/cephfs.logreader.yaml b/ocs_ci/templates/workloads/logwriter/cephfs.logreader.yaml index 2dacb6e5a30..537bb91b4a7 100644 --- a/ocs_ci/templates/workloads/logwriter/cephfs.logreader.yaml +++ b/ocs_ci/templates/workloads/logwriter/cephfs.logreader.yaml @@ -7,6 +7,7 @@ metadata: spec: completions: 6 parallelism: 6 + backoffLimit: 10 completionMode: Indexed template: metadata: diff --git a/tests/conftest.py b/tests/conftest.py index f2abdfbd44b..693fc2d0415 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -186,6 +186,7 @@ get_current_test_name, modify_deployment_replica_count, modify_statefulset_replica_count, + create_network_fence_class, ) from ocs_ci.ocs.ceph_debug import CephObjectStoreTool, MonStoreTool, RookCephPlugin from ocs_ci.ocs.bucket_utils import get_rgw_restart_counts @@ -7910,7 +7911,7 @@ def setup_logwriter_workload(request, teardown_factory): """ - def factory(pvc, logwriter_path): + def factory(pvc, logwriter_path, zone_aware=True): """ Args: pvc (PVC): PVC object @@ -7929,6 +7930,10 @@ def factory(pvc, logwriter_path): dc_data["spec"]["template"]["spec"]["volumes"][0]["persistentVolumeClaim"][ "claimName" ] = pvc.name + + if not zone_aware: + dc_data["spec"]["template"]["spec"].pop("topologySpreadConstraints") + logwriter_dc = helpers.create_resource(**dc_data) teardown_factory(logwriter_dc) @@ -7963,7 +7968,7 @@ def logreader_workload_factory(request, teardown_factory): def setup_logreader_workload(request, teardown_factory): - def factory(pvc, logreader_path, duration=30): + def factory(pvc, logreader_path, duration=30, zone_aware=True): """ Args: pvc (PVC): PVC object @@ -7988,6 +7993,10 @@ def factory(pvc, logreader_path, duration=30): job_data["spec"]["template"]["spec"]["containers"][0]["command"][ 2 ] = f"/opt/logreader.py -t {duration} *.log -d" + + if not zone_aware: + job_data["spec"]["template"]["spec"].pop("topologySpreadConstraints") + logreader_job = helpers.create_resource(**job_data) teardown_factory(logreader_job) @@ -8063,7 +8072,7 @@ def setup_logwriter_cephfs_workload( """ - def factory(read_duration=30): + def factory(read_duration=30, **kwargs): """ Args: read_duration (int): Time duration in minutes @@ -8078,10 +8087,10 @@ def factory(read_duration=30): project_name=setup_stretch_cluster_project ) logwriter_workload = logwriter_workload_factory( - pvc=pvc, logwriter_path=logwriter_path + pvc=pvc, logwriter_path=logwriter_path, **kwargs ) logreader_workload = logreader_workload_factory( - pvc=pvc, logreader_path=logreader_path, duration=read_duration + pvc=pvc, logreader_path=logreader_path, duration=read_duration, **kwargs ) return logwriter_workload, logreader_workload @@ -8117,22 +8126,31 @@ def setup_logwriter_rbd_workload( """ - logwriter_sts_path = constants.LOGWRITER_STS_PATH - sts_data = templating.load_yaml(logwriter_sts_path) - sts_data["metadata"]["namespace"] = setup_stretch_cluster_project.namespace - logwriter_sts = helpers.create_resource(**sts_data) - teardown_factory(logwriter_sts) - logwriter_sts_pods = [ - pod["metadata"]["name"] - for pod in get_pods_having_label( - label="app=logwriter-rbd", namespace=setup_stretch_cluster_project.namespace + def factory(zone_aware=True): + + logwriter_sts_path = constants.LOGWRITER_STS_PATH + sts_data = templating.load_yaml(logwriter_sts_path) + sts_data["metadata"]["namespace"] = setup_stretch_cluster_project.namespace + if not zone_aware: + sts_data["spec"]["template"]["spec"].pop("topologySpreadConstraints") + + 
logwriter_sts = helpers.create_resource(**sts_data) + teardown_factory(logwriter_sts) + logwriter_sts_pods = [ + pod["metadata"]["name"] + for pod in get_pods_having_label( + label="app=logwriter-rbd", + namespace=setup_stretch_cluster_project.namespace, + ) + ] + wait_for_pods_to_be_running( + namespace=setup_stretch_cluster_project.namespace, + pod_names=logwriter_sts_pods, ) - ] - wait_for_pods_to_be_running( - namespace=setup_stretch_cluster_project.namespace, pod_names=logwriter_sts_pods - ) - return logwriter_sts + return logwriter_sts + + return factory @pytest.fixture() @@ -9082,3 +9100,38 @@ def teardown(): # Add the teardown function to the request's finalizer request.addfinalizer(teardown) + + +@pytest.fixture(scope="session") +def setup_network_fence_class(request): + """ + Setup NetworkFenceClass CRD for ODF if not present + + """ + try: + network_fence_class = OCP( + kind=constants.NETWORK_FENCE_CLASS, + namespace=ocsci_config.ENV_DATA["cluster_namespace"], + resource_name=constants.ODF_NETWORK_FENCE_CLASS, + ) + created_by_fixture = False + if not network_fence_class.get(dont_raise=True): + create_network_fence_class() + created_by_fixture = True + else: + log.info( + f"NetworkFenceClass {network_fence_class.resource_name} already exists!" + ) + finally: + + def finalizer(): + """ + Delete the NFC CRD if created by fixture + + """ + if created_by_fixture: + network_fence_class.delete( + resource_name=constants.ODF_NETWORK_FENCE_CLASS + ) + + request.addfinalizer(finalizer) diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py index e69de29bb2d..6899f270f53 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py @@ -0,0 +1,281 @@ +import logging +import pytest +import time + +from ocs_ci.helpers.sanity_helpers import Sanity +from ocs_ci.ocs.node import taint_nodes, get_nodes, get_worker_nodes +from ocs_ci.helpers.helpers import ( + create_network_fence, + get_rbd_daemonset_csi_addons_node_object, + unfence_node, +) +from ocs_ci.helpers.stretchcluster_helper import recover_workload_pods_post_recovery +from ocs_ci.ocs import constants +from ocs_ci.ocs.exceptions import ( + UnexpectedBehaviour, + CommandFailed, + ResourceWrongStatusException, + CephHealthException, +) +from ocs_ci.ocs.node import wait_for_nodes_status +from ocs_ci.ocs.resources.pod import ( + get_not_running_pods, + get_pods_having_label, + Pod, + get_ceph_tools_pod, +) +from ocs_ci.ocs.resources.stretchcluster import StretchCluster +from ocs_ci.utility.retry import retry + +log = logging.getLogger(__name__) + + +class TestZoneUnawareApps: + + def check_for_logwriter_workload_pods( + self, + sc_obj, + ): + + try: + sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL) + sc_obj.get_logwriter_reader_pods( + label=constants.LOGREADER_CEPHFS_LABEL, + statuses=[constants.STATUS_RUNNING, constants.STATUS_COMPLETED], + ) + sc_obj.get_logwriter_reader_pods( + label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2 + ) + except UnexpectedBehaviour: + + log.info("some pods are not running, so trying the work-around") + pods_not_running = get_not_running_pods( + namespace=constants.STRETCH_CLUSTER_NAMESPACE + ) + recover_workload_pods_post_recovery(sc_obj, pods_not_running) + log.info("All the workloads pods are successfully up and running") + + @pytest.fixture() + def init_sanity(self, request, 
nodes): + """ + Initial Cluster sanity + """ + self.sanity_helpers = Sanity() + + def finalizer(): + """ + Make sure all the nodes are Running and + the ceph health is OK at the end of the test + """ + + # check if all the nodes are Running + log.info("Checking if all the nodes are READY") + master_nodes = get_nodes(node_type=constants.MASTER_MACHINE) + worker_nodes = get_nodes(node_type=constants.WORKER_MACHINE) + nodes_not_ready = list() + nodes_not_ready.extend( + [node for node in worker_nodes if node.status() != "Ready"] + ) + nodes_not_ready.extend( + [node for node in master_nodes if node.status() != "Ready"] + ) + + if len(nodes_not_ready) != 0: + try: + nodes.start_nodes(nodes=nodes_not_ready) + except Exception: + log.error( + f"Something went wrong while starting the nodes {nodes_not_ready}!" + ) + raise + + retry( + ( + CommandFailed, + TimeoutError, + AssertionError, + ResourceWrongStatusException, + ), + tries=30, + delay=15, + )(wait_for_nodes_status(timeout=1800)) + log.info( + f"Following nodes {nodes_not_ready} were NOT READY, are now in READY state" + ) + else: + log.info("All nodes are READY") + + # check cluster health + try: + log.info("Making sure ceph health is OK") + self.sanity_helpers.health_check(tries=50, cluster_check=False) + except CephHealthException as e: + assert ( + "HEALTH_WARN" in e.args[0] + ), f"Ignoring Ceph health warnings: {e.args[0]}" + get_ceph_tools_pod().exec_ceph_cmd(ceph_cmd="ceph crash archive-all") + log.info("Archived ceph crash!") + + request.addfinalizer(finalizer) + + @pytest.mark.parametrize( + argnames="fencing", + argvalues=[ + pytest.param( + True, + ), + # pytest.param( + # False, + # ) + ], + ids=[ + "With-Fencing", + # "Without-Fencing", + ], + ) + def test_zone_shutdowns( + self, + init_sanity, + setup_logwriter_cephfs_workload_factory, + setup_logwriter_rbd_workload_factory, + setup_network_fence_class, + nodes, + fencing, + ): + + sc_obj = StretchCluster() + + # fetch all workload details once they're deployed + ( + sc_obj.cephfs_logwriter_dep, + sc_obj.cephfs_logreader_job, + ) = setup_logwriter_cephfs_workload_factory(read_duration=0, zone_aware=False) + + sc_obj.rbd_logwriter_sts = setup_logwriter_rbd_workload_factory( + zone_aware=False + ) + + # get all worker nodes + worker_nodes = get_worker_nodes() + + for zone in constants.DATA_ZONE_LABELS: + self.check_for_logwriter_workload_pods(sc_obj) + log.info("Both logwriter CephFS and RBD workloads are in healthy state") + + log.info( + "Fetching the logfile details for future detection of data loss and data corruption" + ) + sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL) + sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL) + + nodes_to_shutdown = sc_obj.get_nodes_in_zone(zone) + nodes.stop_nodes(nodes=nodes_to_shutdown) + wait_for_nodes_status( + node_names=[node.name for node in nodes_to_shutdown], + status=constants.NODE_NOT_READY, + timeout=300, + ) + log.info(f"Nodes of zone {zone} are shutdown successfully") + + if fencing: + log.info( + "Since fencing is enabled, we need to fence the nodes after zone shutdown" + ) + for node in nodes_to_shutdown: + if node.name not in worker_nodes: + continue + cidrs = retry(CommandFailed, tries=5)( + get_rbd_daemonset_csi_addons_node_object + )(node.name)["status"]["networkFenceClientStatus"][0][ + "ClientDetails" + ][ + 0 + ][ + "cidrs" + ] + retry(CommandFailed, tries=5)(create_network_fence)( + node.name, cidr=cidrs[0] + ) + + taint_nodes( + nodes=[node.name for node in nodes_to_shutdown], + 
taint_label=constants.NODE_OUT_OF_SERVICE_TAINT, + ) + + log.info("Wait until the pod relocation buffer time of 10 minutes") + time.sleep(600) + + log.info( + "Checking if all the logwriter/logreader pods are relocated and successfully running" + ) + sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL) + sc_obj.get_logwriter_reader_pods( + label=constants.LOGREADER_CEPHFS_LABEL, + statuses=[constants.STATUS_RUNNING, constants.STATUS_COMPLETED], + ) + try: + retry(UnexpectedBehaviour, tries=1)(sc_obj.get_logwriter_reader_pods)( + label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2 + ) + except UnexpectedBehaviour: + if not fencing: + log.info( + "It is expected for RBD workload with RWO to stuck in terminating state" + ) + log.info("Trying the workaround now...") + pods_terminating = [ + Pod(**pod_info) + for pod_info in get_pods_having_label( + label=constants.LOGWRITER_RBD_LABEL, + statuses=[constants.STATUS_TERMINATING], + ) + ] + for pod in pods_terminating: + log.info(f"Force deleting the pod {pod.name}") + pod.delete(force=True) + sc_obj.get_logwriter_reader_pods( + label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2 + ) + else: + log.error( + "Looks like pods are not running or relocated even after fencing.. please check" + ) + raise + + if fencing: + log.info( + "If fencing was done, then we need to unfence the nodes once the pods are relocated and running" + ) + for node in nodes_to_shutdown: + if node.name not in worker_nodes: + continue + unfence_node(node.name) + taint_nodes( + nodes=[node.name for node in nodes_to_shutdown], + taint_label=f"{constants.NODE_OUT_OF_SERVICE_TAINT}-", + ) + log.info("Successfully removed taints") + + log.info(f"Starting the {zone} nodes now...") + # start the nodes + try: + nodes.start_nodes(nodes=nodes_to_shutdown) + except Exception: + log.error("Something went wrong while starting the nodes!") + raise + + # Validate all nodes are in READY state and up + retry( + ( + CommandFailed, + TimeoutError, + AssertionError, + ResourceWrongStatusException, + ), + tries=30, + delay=15, + )(wait_for_nodes_status(timeout=1800)) + log.info(f"Nodes of zone {zone} are started successfully") + + self.check_for_logwriter_workload_pods(sc_obj) + log.info("All logwriter workload pods are running!") From e8911ea0ebb78d3a4b056e1df2688f88c709e1d3 Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Fri, 3 Jan 2025 18:58:37 +0530 Subject: [PATCH 3/5] Vefify data integrity post recovery Signed-off-by: Mahesh Shetty --- .../sc_arbiter/test_zone_unaware_app.py | 145 +++++++++++++----- 1 file changed, 105 insertions(+), 40 deletions(-) diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py index 6899f270f53..2951ede559a 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py @@ -2,6 +2,13 @@ import pytest import time +from datetime import datetime, timezone + +from ocs_ci.framework.pytest_customization.marks import ( + stretchcluster_required, + tier1, + turquoise_squad, +) from ocs_ci.helpers.sanity_helpers import Sanity from ocs_ci.ocs.node import taint_nodes, get_nodes, get_worker_nodes from ocs_ci.helpers.helpers import ( @@ -9,7 +16,7 @@ get_rbd_daemonset_csi_addons_node_object, unfence_node, ) -from ocs_ci.helpers.stretchcluster_helper import recover_workload_pods_post_recovery +from ocs_ci.helpers.stretchcluster_helper import 
check_for_logwriter_workload_pods from ocs_ci.ocs import constants from ocs_ci.ocs.exceptions import ( UnexpectedBehaviour, @@ -19,42 +26,23 @@ ) from ocs_ci.ocs.node import wait_for_nodes_status from ocs_ci.ocs.resources.pod import ( - get_not_running_pods, get_pods_having_label, Pod, get_ceph_tools_pod, + wait_for_pods_to_be_in_statuses, ) +from ocs_ci.ocs.resources.pvc import get_pvc_objs from ocs_ci.ocs.resources.stretchcluster import StretchCluster from ocs_ci.utility.retry import retry log = logging.getLogger(__name__) +@tier1 +@stretchcluster_required +@turquoise_squad class TestZoneUnawareApps: - def check_for_logwriter_workload_pods( - self, - sc_obj, - ): - - try: - sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL) - sc_obj.get_logwriter_reader_pods( - label=constants.LOGREADER_CEPHFS_LABEL, - statuses=[constants.STATUS_RUNNING, constants.STATUS_COMPLETED], - ) - sc_obj.get_logwriter_reader_pods( - label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2 - ) - except UnexpectedBehaviour: - - log.info("some pods are not running, so trying the work-around") - pods_not_running = get_not_running_pods( - namespace=constants.STRETCH_CLUSTER_NAMESPACE - ) - recover_workload_pods_post_recovery(sc_obj, pods_not_running) - log.info("All the workloads pods are successfully up and running") - @pytest.fixture() def init_sanity(self, request, nodes): """ @@ -124,13 +112,13 @@ def finalizer(): pytest.param( True, ), - # pytest.param( - # False, - # ) + pytest.param( + False, + ), ], ids=[ "With-Fencing", - # "Without-Fencing", + "Without-Fencing", ], ) def test_zone_shutdowns( @@ -138,6 +126,7 @@ def test_zone_shutdowns( init_sanity, setup_logwriter_cephfs_workload_factory, setup_logwriter_rbd_workload_factory, + logreader_workload_factory, setup_network_fence_class, nodes, fencing, @@ -145,7 +134,7 @@ def test_zone_shutdowns( sc_obj = StretchCluster() - # fetch all workload details once they're deployed + # Deploy the zone un-aware logwriter workloads ( sc_obj.cephfs_logwriter_dep, sc_obj.cephfs_logreader_job, @@ -155,19 +144,24 @@ def test_zone_shutdowns( zone_aware=False ) - # get all worker nodes + # Fetch all the worker node names worker_nodes = get_worker_nodes() for zone in constants.DATA_ZONE_LABELS: - self.check_for_logwriter_workload_pods(sc_obj) + + # Make sure logwriter workload pods are running + check_for_logwriter_workload_pods(sc_obj, nodes=nodes) log.info("Both logwriter CephFS and RBD workloads are in healthy state") - log.info( - "Fetching the logfile details for future detection of data loss and data corruption" - ) + # Fetch logfile details to verify data integrity post recovery sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL) sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL) + log.info( + "Fetched the logfile details for data integrity verification post recovery" + ) + # Shutdown the nodes + start_time = datetime.now(timezone.utc) nodes_to_shutdown = sc_obj.get_nodes_in_zone(zone) nodes.stop_nodes(nodes=nodes_to_shutdown) wait_for_nodes_status( @@ -178,12 +172,18 @@ def test_zone_shutdowns( log.info(f"Nodes of zone {zone} are shutdown successfully") if fencing: + + # If fencing is True, then we need to fence the nodes after shutdown log.info( "Since fencing is enabled, we need to fence the nodes after zone shutdown" ) for node in nodes_to_shutdown: + + # Ignore the master nodes if node.name not in worker_nodes: continue + + # Fetch the cidrs for creating network fence cidrs = retry(CommandFailed, tries=5)( 
get_rbd_daemonset_csi_addons_node_object )(node.name)["status"]["networkFenceClientStatus"][0][ @@ -193,18 +193,23 @@ def test_zone_shutdowns( ][ "cidrs" ] + + # Create the network fence retry(CommandFailed, tries=5)(create_network_fence)( node.name, cidr=cidrs[0] ) + # Taint the nodes that are shutdown taint_nodes( nodes=[node.name for node in nodes_to_shutdown], taint_label=constants.NODE_OUT_OF_SERVICE_TAINT, ) + # Wait for the buffer time of pod relocation log.info("Wait until the pod relocation buffer time of 10 minutes") time.sleep(600) + # Check if all the pods are running log.info( "Checking if all the logwriter/logreader pods are relocated and successfully running" ) @@ -238,11 +243,13 @@ def test_zone_shutdowns( ) else: log.error( - "Looks like pods are not running or relocated even after fencing.. please check" + "Looks like pods are not running or not relocated even after fencing.. please check" ) raise if fencing: + + # If fencing is True, then unfence the nodes once the pods are relocated log.info( "If fencing was done, then we need to unfence the nodes once the pods are relocated and running" ) @@ -250,14 +257,18 @@ def test_zone_shutdowns( if node.name not in worker_nodes: continue unfence_node(node.name) + + # Remove the taints from the nodes that were shutdown taint_nodes( nodes=[node.name for node in nodes_to_shutdown], taint_label=f"{constants.NODE_OUT_OF_SERVICE_TAINT}-", ) - log.info("Successfully removed taints") + log.info( + "Successfully removed taints from the nodes that were shutdown" + ) - log.info(f"Starting the {zone} nodes now...") - # start the nodes + # Start the nodes that were shutdown + log.info(f"Starting the {zone} nodes") try: nodes.start_nodes(nodes=nodes_to_shutdown) except Exception: @@ -275,7 +286,61 @@ def test_zone_shutdowns( tries=30, delay=15, )(wait_for_nodes_status(timeout=1800)) + end_time = datetime.now(timezone.utc) log.info(f"Nodes of zone {zone} are started successfully") - self.check_for_logwriter_workload_pods(sc_obj) + # Verify logwriter workload IO post recovery + sc_obj.post_failure_checks( + start_time, end_time, wait_for_read_completion=False + ) + log.info("Successfully verified with post failure checks for the workloads") + + # Make sure all the logwriter pods are running + check_for_logwriter_workload_pods(sc_obj, nodes=nodes) log.info("All logwriter workload pods are running!") + + # check for any data loss through logwriter logs + assert sc_obj.check_for_data_loss( + constants.LOGWRITER_CEPHFS_LABEL + ), "[CephFS] Data is lost" + log.info("[CephFS] No data loss is seen") + assert sc_obj.check_for_data_loss( + constants.LOGWRITER_RBD_LABEL + ), "[RBD] Data is lost" + log.info("[RBD] No data loss is seen") + + # check for data corruption through logreader logs + sc_obj.cephfs_logreader_job.delete() + for pod in sc_obj.cephfs_logreader_pods: + pod.wait_for_pod_delete(timeout=120) + log.info("All old CephFS logreader pods are deleted") + pvc = get_pvc_objs( + pvc_names=[ + sc_obj.cephfs_logwriter_dep.get()["spec"]["template"]["spec"][ + "volumes" + ][0]["persistentVolumeClaim"]["claimName"] + ], + namespace=constants.STRETCH_CLUSTER_NAMESPACE, + )[0] + logreader_workload_factory( + pvc=pvc, logreader_path=constants.LOGWRITER_CEPHFS_READER, duration=5 + ) + sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL) + + wait_for_pods_to_be_in_statuses( + expected_statuses=constants.STATUS_COMPLETED, + pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods], + timeout=900, + 
namespace=constants.STRETCH_CLUSTER_NAMESPACE, + ) + log.info("[CephFS] Logreader job pods have reached 'Completed' state!") + + assert sc_obj.check_for_data_corruption( + label=constants.LOGREADER_CEPHFS_LABEL + ), "Data is corrupted for cephFS workloads" + log.info("No data corruption is seen in CephFS workloads") + + assert sc_obj.check_for_data_corruption( + label=constants.LOGWRITER_RBD_LABEL + ), "Data is corrupted for RBD workloads" + log.info("No data corruption is seen in RBD workloads") From 6f57c81acb0e3e24e09f163b7769362259f1b7d3 Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Mon, 6 Jan 2025 12:57:15 +0530 Subject: [PATCH 4/5] fix up Signed-off-by: Mahesh Shetty --- ocs_ci/helpers/helpers.py | 8 ++++++++ .../sc_arbiter/test_zone_unaware_app.py | 19 +++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py index 039dfb488cd..c8200cd273e 100644 --- a/ocs_ci/helpers/helpers.py +++ b/ocs_ci/helpers/helpers.py @@ -5798,6 +5798,13 @@ def create_network_fence(node_name, cidr): """ Create NetworkFence for the node + Args: + node_name (str): Name of the node + cidr (str): cidr + + Returns: + OCS: NetworkFence object + """ logger.info("Creating NetworkFence") network_fence_dict = templating.load_yaml(constants.NETWORK_FENCE_CRD) @@ -5808,6 +5815,7 @@ def create_network_fence(node_name, cidr): logger.info( f"NetworkFence {network_fence_obj.name} for node {node_name} created successfully" ) + return network_fence_obj def unfence_node(node_name): diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py index 2951ede559a..f02a9360718 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py @@ -109,15 +109,15 @@ def finalizer(): @pytest.mark.parametrize( argnames="fencing", argvalues=[ - pytest.param( - True, - ), + # pytest.param( + # True, + # ), pytest.param( False, ), ], ids=[ - "With-Fencing", + # "With-Fencing", "Without-Fencing", ], ) @@ -171,6 +171,7 @@ def test_zone_shutdowns( ) log.info(f"Nodes of zone {zone} are shutdown successfully") + network_fence_objs = [] if fencing: # If fencing is True, then we need to fence the nodes after shutdown @@ -195,8 +196,10 @@ def test_zone_shutdowns( ] # Create the network fence - retry(CommandFailed, tries=5)(create_network_fence)( - node.name, cidr=cidrs[0] + network_fence_objs.append( + retry(CommandFailed, tries=5)(create_network_fence)( + node.name, cidr=cidrs[0] + ) ) # Taint the nodes that are shutdown @@ -267,6 +270,10 @@ def test_zone_shutdowns( "Successfully removed taints from the nodes that were shutdown" ) + # Delete NetworkFence objects for each of the nodes + for network_fence in network_fence_objs: + network_fence.delete() + # Start the nodes that were shutdown log.info(f"Starting the {zone} nodes") try: From c38203c159df8f6bbd71226b091b63e7ca4d751c Mon Sep 17 00:00:00 2001 From: Mahesh Shetty Date: Fri, 24 Jan 2025 13:42:47 +0530 Subject: [PATCH 5/5] Fix up Signed-off-by: Mahesh Shetty --- ocs_ci/helpers/helpers.py | 29 +++++++++++------- ocs_ci/ocs/resources/stretchcluster.py | 2 +- .../sc_arbiter/test_zone_unaware_app.py | 30 ++++++++++++------- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/ocs_ci/helpers/helpers.py b/ocs_ci/helpers/helpers.py index c8200cd273e..f090b02c33c 100644 --- a/ocs_ci/helpers/helpers.py +++ 
b/ocs_ci/helpers/helpers.py @@ -5818,24 +5818,33 @@ def create_network_fence(node_name, cidr): return network_fence_obj -def unfence_node(node_name): +def unfence_node(node_name, delete=False): """ Un-fence node Args: node_name (str): Name of the node + delete (bool): If True, delete the network fence object """ network_fence_obj = OCP( kind=constants.NETWORK_FENCE, namespace=config.ENV_DATA["cluster_namespace"] ) - network_fence_obj.patch( - resource_name=node_name, - params='{"spec":{"fenceState":"Unfenced"}}', - format_type="merge", - ) - assert ( - network_fence_obj.get(resource_name=node_name)["spec"]["fenceState"] != "Fenced" - ), f"{node_name} doesnt seem to be unfenced" - logger.info(f"Unfenced node {node_name} successfully!") \ No newline at end of file + if network_fence_obj.get(resource_name=node_name, dont_raise=True): + network_fence_obj.patch( + resource_name=node_name, + params='{"spec":{"fenceState":"Unfenced"}}', + format_type="merge", + ) + assert ( + network_fence_obj.get(resource_name=node_name)["spec"]["fenceState"] + != "Fenced" + ), f"{node_name} doesnt seem to be unfenced" + logger.info(f"Unfenced node {node_name} successfully!") + + if delete: + network_fence_obj.delete() + logger.info(f"Deleted network fence object for node {node_name}") + else: + logger.info(f"No networkfence found for node {node_name}") diff --git a/ocs_ci/ocs/resources/stretchcluster.py b/ocs_ci/ocs/resources/stretchcluster.py index 5a204e7ce94..68c86aa446f 100644 --- a/ocs_ci/ocs/resources/stretchcluster.py +++ b/ocs_ci/ocs/resources/stretchcluster.py @@ -319,7 +319,7 @@ def get_logfile_map(self, label): self.logfile_map[label][0] = list(set(self.logfile_map[label][0])) logger.info(self.logfile_map[label][0]) - @retry(UnexpectedBehaviour, tries=6, delay=5) + @retry(UnexpectedBehaviour, tries=8, delay=5) def get_logwriter_reader_pods( self, label, diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py index f02a9360718..c66104c63e4 100644 --- a/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py +++ b/tests/functional/disaster-recovery/sc_arbiter/test_zone_unaware_app.py @@ -2,8 +2,6 @@ import pytest import time -from datetime import datetime, timezone - from ocs_ci.framework.pytest_customization.marks import ( stretchcluster_required, tier1, @@ -30,6 +28,7 @@ Pod, get_ceph_tools_pod, wait_for_pods_to_be_in_statuses, + logger, ) from ocs_ci.ocs.resources.pvc import get_pvc_objs from ocs_ci.ocs.resources.stretchcluster import StretchCluster @@ -106,6 +105,22 @@ def finalizer(): request.addfinalizer(finalizer) + @pytest.fixture(autouse=True) + def unfence_teardown(self, request): + """ + In case of failure in between test run unfence the networkfence + and delete the NetworkFence objects + + """ + + def teardown(): + all_worker_nodes = get_worker_nodes() + for node_name in all_worker_nodes: + unfence_node(node_name, delete=True) + logger.info("cleaned up all network fence objects if any") + + request.addfinalizer(teardown) + @pytest.mark.parametrize( argnames="fencing", argvalues=[ @@ -161,7 +176,6 @@ def test_zone_shutdowns( ) # Shutdown the nodes - start_time = datetime.now(timezone.utc) nodes_to_shutdown = sc_obj.get_nodes_in_zone(zone) nodes.stop_nodes(nodes=nodes_to_shutdown) wait_for_nodes_status( @@ -235,9 +249,10 @@ def test_zone_shutdowns( Pod(**pod_info) for pod_info in get_pods_having_label( label=constants.LOGWRITER_RBD_LABEL, - 
statuses=[constants.STATUS_TERMINATING], + namespace=constants.STRETCH_CLUSTER_NAMESPACE, ) ] + log.info(pods_terminating) for pod in pods_terminating: log.info(f"Force deleting the pod {pod.name}") pod.delete(force=True) @@ -293,15 +308,8 @@ def test_zone_shutdowns( tries=30, delay=15, )(wait_for_nodes_status(timeout=1800)) - end_time = datetime.now(timezone.utc) log.info(f"Nodes of zone {zone} are started successfully") - # Verify logwriter workload IO post recovery - sc_obj.post_failure_checks( - start_time, end_time, wait_for_read_completion=False - ) - log.info("Successfully verified with post failure checks for the workloads") - # Make sure all the logwriter pods are running check_for_logwriter_workload_pods(sc_obj, nodes=nodes) log.info("All logwriter workload pods are running!")
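
For reference, the fence/taint/unfence workflow that these patches add to the zone-shutdown test can be read as one small flow. The sketch below is illustrative only: the fence_zone_workers/unfence_zone_workers wrappers are hypothetical and not part of this series, but every imported helper and constant (create_network_fence, get_rbd_daemonset_csi_addons_node_object, unfence_node, taint_nodes, NODE_OUT_OF_SERVICE_TAINT) comes from the patches above and is used with the same signatures.

    # Illustrative sketch only: how the fencing helpers introduced in this series
    # fit together. Wrapper functions are hypothetical; imported names come from
    # the patched ocs_ci.helpers.helpers and ocs_ci.ocs.node modules.
    from ocs_ci.helpers.helpers import (
        create_network_fence,
        get_rbd_daemonset_csi_addons_node_object,
        unfence_node,
    )
    from ocs_ci.ocs import constants
    from ocs_ci.ocs.node import taint_nodes


    def fence_zone_workers(nodes_to_shutdown, worker_nodes):
        """Fence the shut-down worker nodes and taint them out-of-service."""
        fences = []
        for node in nodes_to_shutdown:
            if node.name not in worker_nodes:
                # Skip non-worker (master) nodes; only the RBD daemonset
                # CSIAddonsNode objects on workers carry the client CIDRs.
                continue
            cidrs = get_rbd_daemonset_csi_addons_node_object(node.name)["status"][
                "networkFenceClientStatus"
            ][0]["ClientDetails"][0]["cidrs"]
            # Create a NetworkFence CR for the node's CIDR (patch 4 returns the object)
            fences.append(create_network_fence(node.name, cidr=cidrs[0]))
        # Apply the out-of-service taint so pods relocate off the dead zone
        taint_nodes(
            nodes=[node.name for node in nodes_to_shutdown],
            taint_label=constants.NODE_OUT_OF_SERVICE_TAINT,
        )
        return fences


    def unfence_zone_workers(nodes_to_shutdown, worker_nodes):
        """Unfence the workers, delete the NetworkFence CRs, and drop the taint."""
        for node in nodes_to_shutdown:
            if node.name in worker_nodes:
                # delete=True (patch 5) also removes the NetworkFence object
                unfence_node(node.name, delete=True)
        # Trailing '-' removes the previously applied taint
        taint_nodes(
            nodes=[node.name for node in nodes_to_shutdown],
            taint_label=f"{constants.NODE_OUT_OF_SERVICE_TAINT}-",
        )

In the test itself this corresponds to the fencing branch after the zone shutdown and the unfence/untaint branch once the relocated logwriter pods are confirmed running, with the autouse unfence_teardown fixture cleaning up any leftover NetworkFence objects on failure.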