From 84a914be6768aa0f9278d20625c960a8761514b2 Mon Sep 17 00:00:00 2001
From: Mahesh Shetty
Date: Thu, 29 Aug 2024 16:48:03 +0530
Subject: [PATCH] run workloads with device replacement

Signed-off-by: Mahesh Shetty
---
 .../sc_arbiter/test_device_replacement.py | 136 +++++++++++-------
 1 file changed, 85 insertions(+), 51 deletions(-)

diff --git a/tests/functional/disaster-recovery/sc_arbiter/test_device_replacement.py b/tests/functional/disaster-recovery/sc_arbiter/test_device_replacement.py
index 1e73daee1816..99f528d3e0c2 100644
--- a/tests/functional/disaster-recovery/sc_arbiter/test_device_replacement.py
+++ b/tests/functional/disaster-recovery/sc_arbiter/test_device_replacement.py
@@ -1,64 +1,98 @@
-# import random
 import logging
+from datetime import datetime, timezone
+
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_in_statuses
 
-# from ocs_ci.ocs import constants
-# from ocs_ci.ocs.resources.pv import get_pv_in_status
-# from ocs_ci.ocs.resources.deployment import get_osd_deployments
-# from ocs_ci.helpers.helpers import modify_deployment_replica_count
-# from ocs_ci.ocs.resources.pod import (
-#     wait_for_pods_by_label_count,
-#     delete_all_osd_removal_jobs,
-#     run_osd_removal_job,
-#     verify_osd_removal_job_completed_successfully,
-# )
 from ocs_ci.ocs.osd_operations import osd_device_replacement
+from ocs_ci.ocs.resources.stretchcluster import StretchCluster
 
 logger = logging.getLogger(__name__)
 
 
 class TestDeviceReplacementInStretchCluster:
-    def test_device_replacement(self, nodes):
+    def test_device_replacement(
+        self, nodes, setup_logwriter_cephfs_workload_factory, logreader_workload_factory
+    ):
         """
-        Test device replacement in stretch cluster
+        Test device replacement in a stretch cluster while logwriter workloads
+        for both CephFS and RBD are running
+
+        Steps:
+            1) Run logwriter/reader workloads for both CephFS and RBD volumes
+            2) Perform the device replacement procedure
+            3) Verify no data loss
+            4) Verify no data corruption
 
         """
+
+        sc_obj = StretchCluster()
+
+        # set up logwriter workloads in the background
+        (
+            sc_obj.cephfs_logwriter_dep,
+            sc_obj.cephfs_logreader_job,
+        ) = setup_logwriter_cephfs_workload_factory(read_duration=0)
+
+        sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
+        sc_obj.get_logwriter_reader_pods(label=constants.LOGREADER_CEPHFS_LABEL)
+        sc_obj.get_logwriter_reader_pods(
+            label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
+        )
+        logger.info("All the workload pods are successfully up and running")
+
+        start_time = datetime.now(timezone.utc)
+
+        sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
+        sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL)
+
+        # run the device replacement procedure
+        logger.info("Running device replacement procedure now")
         osd_device_replacement(nodes)
 
-        # # Choose the OSD that needs to be replaced
-        # all_osd_deployments = get_osd_deployments()
-        # osd_deployment = random.choice(all_osd_deployments)
-        # osd_id = osd_deployment.get()["metadata"]["labels"]["ceph-osd-id"]
-        # logger.info(f"osd-{osd_id} needs to be removed")
-        #
-        # # Scale down the osd deployment
-        # logger.info(f"scaling down {osd_deployment.name} now...")
-        # modify_deployment_replica_count(
-        #     osd_deployment.name,
-        #     replica_count=0,
-        # )
-        #
-        # # Verify that osd pods are terminated
-        # wait_for_pods_by_label_count(
-        #     f"ceph-osd-id={osd_id}",
-        #     exptected_count=0,
-        # )
-        # logger.info("osd pods are terminated successfully")
-        #
-        # # Remove the osd from the cluster
-        # # to add new osd
-        # delete_all_osd_removal_jobs()
-        # run_osd_removal_job(osd_ids=[osd_id])
-        #
-        # # Verify that OSDs are removed successfully
-        # verify_osd_removal_job_completed_successfully(osd_id)
-        #
-        # # Find the persistent volume (PV) that need to be deleted and delete it
-        # pvs = get_pv_in_status(
-        #     storage_class=constants.LOCALSTORAGE_SC, status=constants.STATUS_RELEASED
-        # )
-        # for pv in pvs:
-        #     pv.delete()
-
-        # Track the provisioning of PVs for the devices that match the deviceInclusionSpec
-
-        # Delete the ocs-osd-removal job(s).
+        # check IO for any failures
+        end_time = datetime.now(timezone.utc)
+        sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
+        logger.info("Successfully verified the workloads with post failure checks")
+
+        sc_obj.cephfs_logreader_job.delete()
+        logger.info(sc_obj.cephfs_logreader_pods)
+        for pod in sc_obj.cephfs_logreader_pods:
+            pod.wait_for_pod_delete(timeout=120)
+        logger.info("All old CephFS logreader pods are deleted")
+
+        # check for any data loss
+        assert sc_obj.check_for_data_loss(
+            constants.LOGWRITER_CEPHFS_LABEL
+        ), "[CephFS] Data is lost"
+        logger.info("[CephFS] No data loss is seen")
+        assert sc_obj.check_for_data_loss(
+            constants.LOGWRITER_RBD_LABEL
+        ), "[RBD] Data is lost"
+        logger.info("[RBD] No data loss is seen")
+
+        # check for data corruption
+        logreader_workload_factory(
+            pvc=sc_obj.get_workload_pvc_obj(constants.LOGWRITER_CEPHFS_LABEL)[0],
+            logreader_path=constants.LOGWRITER_CEPHFS_READER,
+            duration=5,
+        )
+        sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)
+
+        wait_for_pods_to_be_in_statuses(
+            expected_statuses=constants.STATUS_COMPLETED,
+            pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
+            timeout=900,
+            namespace=constants.STRETCH_CLUSTER_NAMESPACE,
+        )
+        logger.info("[CephFS] Logreader job pods have reached 'Completed' state!")
+
+        assert sc_obj.check_for_data_corruption(
+            label=constants.LOGREADER_CEPHFS_LABEL
+        ), "Data is corrupted for CephFS workloads"
+        logger.info("No data corruption is seen in CephFS workloads")
+
+        assert sc_obj.check_for_data_corruption(
+            label=constants.LOGWRITER_RBD_LABEL
+        ), "Data is corrupted for RBD workloads"
+        logger.info("No data corruption is seen in RBD workloads")