[Stretch cluster] test device replacement in stretch cluster #9548

Open, wants to merge 5 commits into base: master
16 changes: 16 additions & 0 deletions ocs_ci/ocs/resources/pv.py
@@ -55,6 +55,22 @@ def get_pv_status(pv_obj):
return pv_obj.get("status").get("phase")


def get_pv_in_status(storage_class, status="Bound"):
"""
Looks for PVs of a particular storage class that are in a particular status

Args:
storage_class (str): name of the storage class
status (str): expected status of the PV

Returns:
Contributor: add an empty line above this

list of pv objects

"""

pvs = [pv for pv in get_pv_objs_in_sc(storage_class) if get_pv_status(pv) == status]
return pvs
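
For context, a minimal usage sketch of the new helper (the storage class name below is only an example; get_pv_name is the function defined just after this one in the same module):

# collect the names of all Bound PVs backed by an RBD storage class
bound_pv_names = [
    get_pv_name(pv)
    for pv in get_pv_in_status("ocs-storagecluster-ceph-rbd", status="Bound")
]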


def get_pv_name(pv_obj):
"""
Get the name of the pv object
@@ -5,7 +5,9 @@
from ocs_ci.framework.pytest_customization.marks import (
turquoise_squad,
stretchcluster_required,
tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources import storage_cluster
from ocs_ci.ocs.resources.pod import (
@@ -18,6 +20,7 @@
logger = logging.getLogger(__name__)


@tier1
@turquoise_squad
@stretchcluster_required
class TestAddCapacityStretchCluster:
@@ -85,6 +88,8 @@ def test_cluster_expansion(
setup_logwriter_rbd_workload_factory,
logreader_workload_factory,
iterations,
setup_cnv,
cnv_workload,
):
"""
Test cluster expansion and health when add capacity is performed
@@ -107,6 +112,13 @@
)
logger.info("All the workloads pods are successfully up and running")

# set up a VM and write some data to the VM instance
Contributor: this code block (til line 160) is quite generic (it repeats itself in https://github.com/red-hat-storage/ocs-ci/pull/9548/files#diff-46f753f4e560b98288b4f1da2258354d49a5ad5d15b1e3110e64e02829a06ed5R26) and could be used for other tests that include VM workloads. Suggest moving it to a VM related helper file.

Contributor (author): As discussed offline, created issue #11186 to track. Will be addressing this in the next release (4.19)

Contributor: +1 to Elad
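
For reference, a minimal sketch of the kind of shared helper the reviewers suggest (hypothetical name and placement; it only reuses calls already present in this test, and the actual refactor is tracked in issue #11186):

# hypothetical helper, e.g. in ocs_ci/helpers/cnv_helpers.py; assumes the same
# imports as this test module (constants, cal_md5sum_vm)
def setup_vm_with_data(cnv_workload, file_path="/file_1.txt", count=102400):
    """
    Create a CNV VM workload, write a test file into it and return the VM
    object together with the md5sum of the written file.
    """
    vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
    vm_obj.run_ssh_cmd(
        command=f"dd if=/dev/zero of={file_path} bs=1024 count={count}"
    )
    return vm_obj, cal_md5sum_vm(vm_obj, file_path=file_path)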

vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
)
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

start_time = datetime.now(timezone.utc)

sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
@@ -123,6 +135,29 @@
sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
logger.info("Successfully verified with post failure checks for the workloads")

# verify integrity of the data written before the failure (md5sum captured after the failure)
Contributor: this md5sum is captured after the failure. Please update the comment accordingly

md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
assert (
md5sum_before == md5sum_after
), "Data integrity of the file inside VM is not maintained during the add capacity"
logger.info(
"Data integrity of the file inside VM is maintained during the add capacity"
)

# check if new data can be created
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
)
logger.info("Successfully created new data inside VM")

# check if the data can be copied back to local machine
vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
logger.info("VM data is successfully copied back to local machine")

# stop the VM
vm_obj.stop()
logger.info("Stoped the VM successfully")
Comment on lines +157 to +159
Contributor: what is the need to stop the VM?

Contributor (author): this is included as part of the test, to see if we can stop the VM post recovery


sc_obj.cephfs_logreader_job.delete()
logger.info(sc_obj.cephfs_logreader_pods)
for pod in sc_obj.cephfs_logreader_pods:
@@ -0,0 +1,146 @@
import logging
from datetime import datetime, timezone

from ocs_ci.framework.pytest_customization.marks import (
stretchcluster_required,
turquoise_squad,
polarion_id,
tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_in_statuses

from ocs_ci.ocs.osd_operations import osd_device_replacement
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

logger = logging.getLogger(__name__)


@tier1
@stretchcluster_required
mashetty330 marked this conversation as resolved.
@turquoise_squad
class TestDeviceReplacementInStretchCluster:

@polarion_id("OCS-5047")
def test_device_replacement(
self,
nodes,
setup_logwriter_cephfs_workload_factory,
setup_logwriter_rbd_workload_factory,
logreader_workload_factory,
cnv_workload,
setup_cnv,
):
"""
Test device replacement in stretch cluster while logwriter workloads
for both CephFS and RBD are running

Steps:
1) Run logwriter/reader workloads for both CephFS and RBD volumes
2) Perform device replacement procedure
3) Verify no data loss
4) Verify no data corruption

"""

sc_obj = StretchCluster()

# setup logwriter workloads in the background
(
sc_obj.cephfs_logwriter_dep,
sc_obj.cephfs_logreader_job,
) = setup_logwriter_cephfs_workload_factory(read_duration=0)

sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(label=constants.LOGREADER_CEPHFS_LABEL)
sc_obj.get_logwriter_reader_pods(
label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
)
logger.info("All the workloads pods are successfully up and running")

# set up a VM and write some data to the VM instance
vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
)
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

start_time = datetime.now(timezone.utc)

sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL)

# run device replacement procedure
logger.info("Running device replacement procedure now")
osd_device_replacement(nodes)

# check IO for any failures
end_time = datetime.now(timezone.utc)
sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
logger.info("Successfully verified with post failure checks for the workloads")

# verify integrity of the data written before the failure (md5sum captured after the failure)
Contributor: this md5sum is captured after the failure. Please update the comment accordingly

md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
assert (
md5sum_before == md5sum_after
), "Data integrity of the file inside VM is not maintained during the device replacement"
logger.info(
"Data integrity of the file inside VM is maintained during the device replacement"
)

# check if new data can be created
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
)
logger.info("Successfully created new data inside VM")

# check if the data can be copied back to local machine
vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
logger.info("VM data is successfully copied back to local machine")

# stop the VM
vm_obj.stop()
logger.info("Stoped the VM successfully")
Comment on lines +102 to +104
Contributor: what is the need to stop the VM?

Contributor (author): this is included as part of the test, to see if we can stop the VM post recovery


sc_obj.cephfs_logreader_job.delete()
logger.info(sc_obj.cephfs_logreader_pods)
for pod in sc_obj.cephfs_logreader_pods:
pod.wait_for_pod_delete(timeout=120)
logger.info("All old CephFS logreader pods are deleted")

# check for any data loss
Contributor: Checking for data loss and data corruption is quite generic and can be used in other stretch cluster tests as well. I would suggest moving it to a common location outside the test and calling that function within the test.

Contributor (author): ack. will take care of it as part of #11186 in the next release
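
For reference, a rough sketch of the kind of shared check the reviewer is asking for (hypothetical name; it only reuses calls already present in this test, and the refactor itself is tracked in #11186):

# hypothetical shared helper for stretch cluster tests; assumes the same
# imports as this test module (constants, wait_for_pods_to_be_in_statuses)
def verify_no_data_loss_or_corruption(sc_obj, logreader_workload_factory):
    """
    Verify that neither data loss nor data corruption occurred for the
    CephFS and RBD logwriter workloads after a failure scenario.
    """
    # data loss checks
    assert sc_obj.check_for_data_loss(
        constants.LOGWRITER_CEPHFS_LABEL
    ), "[CephFS] Data is lost"
    assert sc_obj.check_for_data_loss(
        constants.LOGWRITER_RBD_LABEL
    ), "[RBD] Data is lost"

    # rerun the CephFS logreader job and wait for it to complete
    logreader_workload_factory(
        pvc=sc_obj.get_workload_pvc_obj(constants.LOGWRITER_CEPHFS_LABEL)[0],
        logreader_path=constants.LOGWRITER_CEPHFS_READER,
        duration=5,
    )
    sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)
    wait_for_pods_to_be_in_statuses(
        expected_statuses=constants.STATUS_COMPLETED,
        pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
        timeout=900,
        namespace=constants.STRETCH_CLUSTER_NAMESPACE,
    )

    # data corruption checks
    assert sc_obj.check_for_data_corruption(
        label=constants.LOGREADER_CEPHFS_LABEL
    ), "Data is corrupted for CephFS workloads"
    assert sc_obj.check_for_data_corruption(
        label=constants.LOGWRITER_RBD_LABEL
    ), "Data is corrupted for RBD workloads"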

assert sc_obj.check_for_data_loss(
constants.LOGWRITER_CEPHFS_LABEL
), "[CephFS] Data is lost"
logger.info("[CephFS] No data loss is seen")
assert sc_obj.check_for_data_loss(
constants.LOGWRITER_RBD_LABEL
), "[RBD] Data is lost"
logger.info("[RBD] No data loss is seen")

# check for data corruption
logreader_workload_factory(
pvc=sc_obj.get_workload_pvc_obj(constants.LOGWRITER_CEPHFS_LABEL)[0],
logreader_path=constants.LOGWRITER_CEPHFS_READER,
duration=5,
)
sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)

wait_for_pods_to_be_in_statuses(
expected_statuses=constants.STATUS_COMPLETED,
pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
timeout=900,
namespace=constants.STRETCH_CLUSTER_NAMESPACE,
)
logger.info("[CephFS] Logreader job pods have reached 'Completed' state!")

assert sc_obj.check_for_data_corruption(
label=constants.LOGREADER_CEPHFS_LABEL
), "Data is corrupted for cephFS workloads"
logger.info("No data corruption is seen in CephFS workloads")

assert sc_obj.check_for_data_corruption(
label=constants.LOGWRITER_RBD_LABEL
), "Data is corrupted for RBD workloads"
logger.info("No data corruption is seen in RBD workloads")
@@ -10,6 +10,7 @@
polarion_id,
stretchcluster_required,
turquoise_squad,
tier2,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.helpers.helpers import modify_deployment_replica_count
@@ -147,7 +148,7 @@ def setup_cnv_workload(request, cnv_workload_class, setup_cnv):
logger.info("Setting up CNV workload and creating some data")
vm_obj = cnv_workload_class(
volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
)[0]
)
vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

@@ -181,6 +182,7 @@ def finalizer():
request.addfinalizer(finalizer)


@tier2
@turquoise_squad
@stretchcluster_required
@pytest.mark.usefixtures("setup_cnv_workload")