[Stretch cluster] test device replacement in stretch cluster #9548
base: master
Changes from all commits: d152252, ddd260d, 8b00a27, a250e1d, 64a4d37
@@ -5,7 +5,9 @@
from ocs_ci.framework.pytest_customization.marks import (
    turquoise_squad,
    stretchcluster_required,
    tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources import storage_cluster
from ocs_ci.ocs.resources.pod import (
@@ -18,6 +20,7 @@
logger = logging.getLogger(__name__)


@tier1
@turquoise_squad
@stretchcluster_required
class TestAddCapacityStretchCluster:
@@ -85,6 +88,8 @@ def test_cluster_expansion(
        setup_logwriter_rbd_workload_factory,
        logreader_workload_factory,
        iterations,
        setup_cnv,
        cnv_workload,
    ):
        """
        Test cluster expansion and health when add capacity is performed
@@ -107,6 +112,13 @@ def test_cluster_expansion(
        )
        logger.info("All the workloads pods are successfully up and running")

        # setup vm and write some data to the VM instance
Review comment: this code block (til line 160) is quite generic (it repeats itself in https://github.com/red-hat-storage/ocs-ci/pull/9548/files#diff-46f753f4e560b98288b4f1da2258354d49a5ad5d15b1e3110e64e02829a06ed5R26) and could be used for other tests that include VM workloads. Suggest moving it to a VM related helper file.
Reply: As discussed offline, created issue #11186 to track. Will be addressing this in the next release (4.19).
Reply: +1 to Elad
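One possible shape for such a helper (a rough sketch only; the name setup_vm_with_data, a location such as ocs_ci/helpers/cnv_helpers.py, and the signature are assumptions for illustration, not part of this PR). It simply wraps the VM setup and data-write block that both tests repeat:

import logging

from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants

logger = logging.getLogger(__name__)


def setup_vm_with_data(cnv_workload, file_path="/file_1.txt", count=102400):
    """
    Create a CNV VM via the cnv_workload factory, write a test file inside
    the VM, and return the VM object together with the file's md5sum.
    (Hypothetical helper suggested by the review; not part of this PR.)
    """
    # create the VM backed by a PVC volume
    vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
    # write some data inside the VM
    vm_obj.run_ssh_cmd(command=f"dd if=/dev/zero of={file_path} bs=1024 count={count}")
    # record the checksum so callers can verify integrity after a failure
    md5sum = cal_md5sum_vm(vm_obj, file_path=file_path)
    logger.info("VM workload is set up and test data is written")
    return vm_obj, md5sum

A test could then call vm_obj, md5sum_before = setup_vm_with_data(cnv_workload) and keep only the integrity assertions inline.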
        vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
        )
        md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

        start_time = datetime.now(timezone.utc)

        sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
@@ -123,6 +135,29 @@ def test_cluster_expansion(
        sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
        logger.info("Successfully verified with post failure checks for the workloads")

        # check vm data written before the failure for integrity
Review comment: this md5sum is captured after the failure. Please update the comment accordingly.
        md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
        assert (
            md5sum_before == md5sum_after
        ), "Data integrity of the file inside VM is not maintained during the add capacity"
        logger.info(
            "Data integrity of the file inside VM is maintained during the add capacity"
        )

        # check if new data can be created
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
        )
        logger.info("Successfully created new data inside VM")

        # check if the data can be copied back to local machine
        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
        logger.info("VM data is successfully copied back to local machine")

        # stop the VM
        vm_obj.stop()
        logger.info("Stopped the VM successfully")
Comment on lines +157 to +159
Review comment: what is the need to stop the VM?
Reply: this is included as part of the test to see if we can stop the VM post recovery.
        sc_obj.cephfs_logreader_job.delete()
        logger.info(sc_obj.cephfs_logreader_pods)
        for pod in sc_obj.cephfs_logreader_pods:
@@ -0,0 +1,146 @@
import logging
from datetime import datetime, timezone

from ocs_ci.framework.pytest_customization.marks import (
    stretchcluster_required,
    turquoise_squad,
    polarion_id,
    tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_in_statuses

from ocs_ci.ocs.osd_operations import osd_device_replacement
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

logger = logging.getLogger(__name__)


@tier1
@stretchcluster_required
@turquoise_squad
class TestDeviceReplacementInStretchCluster:

    @polarion_id("OCS-5047")
    def test_device_replacement(
        self,
        nodes,
        setup_logwriter_cephfs_workload_factory,
        setup_logwriter_rbd_workload_factory,
        logreader_workload_factory,
        cnv_workload,
        setup_cnv,
    ):
        """
        Test device replacement in stretch cluster while logwriter workload
        for both CephFS and RBD is running

        Steps:
            1) Run logwriter/reader workload for both CephFS and RBD volumes
            2) Perform device replacement procedure
            3) Verify no data loss
            4) Verify no data corruption

        """

        sc_obj = StretchCluster()

        # setup logwriter workloads in the background
        (
            sc_obj.cephfs_logwriter_dep,
            sc_obj.cephfs_logreader_job,
        ) = setup_logwriter_cephfs_workload_factory(read_duration=0)

        sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
        sc_obj.get_logwriter_reader_pods(label=constants.LOGREADER_CEPHFS_LABEL)
        sc_obj.get_logwriter_reader_pods(
            label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
        )
        logger.info("All the workloads pods are successfully up and running")

        # setup vm and write some data to the VM instance
        vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
        )
        md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

        start_time = datetime.now(timezone.utc)

        sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
        sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL)

        # run device replacement procedure
        logger.info("Running device replacement procedure now")
        osd_device_replacement(nodes)

        # check IO for any failures
        end_time = datetime.now(timezone.utc)
        sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
        logger.info("Successfully verified with post failure checks for the workloads")

        # check vm data written before the failure for integrity
Review comment: this md5sum is captured after the failure. Please update the comment accordingly.
        md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
        assert (
            md5sum_before == md5sum_after
        ), "Data integrity of the file inside VM is not maintained during the device replacement"
        logger.info(
            "Data integrity of the file inside VM is maintained during the device replacement"
        )

        # check if new data can be created
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
        )
        logger.info("Successfully created new data inside VM")

        # check if the data can be copied back to local machine
        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
        logger.info("VM data is successfully copied back to local machine")

        # stop the VM
        vm_obj.stop()
        logger.info("Stopped the VM successfully")
Comment on lines +102 to +104
Review comment: what is the need to stop the VM?
Reply: this is included as part of the test to see if we can stop the VM post recovery.
        sc_obj.cephfs_logreader_job.delete()
        logger.info(sc_obj.cephfs_logreader_pods)
        for pod in sc_obj.cephfs_logreader_pods:
            pod.wait_for_pod_delete(timeout=120)
        logger.info("All old CephFS logreader pods are deleted")

        # check for any data loss
Review comment: Checking for data loss and data corruption is quite generic and can be used in other stretch cluster tests as well. I would suggest moving it to a common location outside the test and calling that function within the test.
Reply: ack. will take care of it as part of #11186 in the next release.
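A sketch of how that shared check might look (the helper name verify_no_data_loss_or_corruption and its placement are assumptions for illustration, not part of this PR; it only reuses the StretchCluster calls already made below, and it still assumes the CephFS logreader job has been re-run beforehand, as the test does):

import logging

from ocs_ci.ocs import constants

logger = logging.getLogger(__name__)


def verify_no_data_loss_or_corruption(sc_obj):
    """
    Assert that neither data loss nor data corruption is seen for the CephFS
    and RBD logwriter workloads tracked by the given StretchCluster object.
    (Hypothetical helper suggested by the review; not part of this PR.)
    """
    # data loss checks for both interfaces
    for label, name in (
        (constants.LOGWRITER_CEPHFS_LABEL, "CephFS"),
        (constants.LOGWRITER_RBD_LABEL, "RBD"),
    ):
        assert sc_obj.check_for_data_loss(label), f"[{name}] Data is lost"
        logger.info(f"[{name}] No data loss is seen")

    # data corruption checks for both interfaces
    assert sc_obj.check_for_data_corruption(
        label=constants.LOGREADER_CEPHFS_LABEL
    ), "Data is corrupted for CephFS workloads"
    assert sc_obj.check_for_data_corruption(
        label=constants.LOGWRITER_RBD_LABEL
    ), "Data is corrupted for RBD workloads"
    logger.info("No data corruption is seen in CephFS or RBD workloads")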
        assert sc_obj.check_for_data_loss(
            constants.LOGWRITER_CEPHFS_LABEL
        ), "[CephFS] Data is lost"
        logger.info("[CephFS] No data loss is seen")
        assert sc_obj.check_for_data_loss(
            constants.LOGWRITER_RBD_LABEL
        ), "[RBD] Data is lost"
        logger.info("[RBD] No data loss is seen")

        # check for data corruption
        logreader_workload_factory(
            pvc=sc_obj.get_workload_pvc_obj(constants.LOGWRITER_CEPHFS_LABEL)[0],
            logreader_path=constants.LOGWRITER_CEPHFS_READER,
            duration=5,
        )
        sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)

        wait_for_pods_to_be_in_statuses(
            expected_statuses=constants.STATUS_COMPLETED,
            pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
            timeout=900,
            namespace=constants.STRETCH_CLUSTER_NAMESPACE,
        )
        logger.info("[CephFS] Logreader job pods have reached 'Completed' state!")

        assert sc_obj.check_for_data_corruption(
            label=constants.LOGREADER_CEPHFS_LABEL
        ), "Data is corrupted for CephFS workloads"
        logger.info("No data corruption is seen in CephFS workloads")

        assert sc_obj.check_for_data_corruption(
            label=constants.LOGWRITER_RBD_LABEL
        ), "Data is corrupted for RBD workloads"
        logger.info("No data corruption is seen in RBD workloads")
Review comment: add an empty line above this.