From a272f6207e593f91b0ca4ab20180da6488403b8f Mon Sep 17 00:00:00 2001
From: Petr Balogh
Date: Fri, 17 Jan 2025 17:53:20 +0100
Subject: [PATCH] Improvements for MG collections

Fixes: #10526
Fixes: #11159

Several improvements in MG log collection, such as preventing MG from
running over and over while it keeps failing or timing out.

Collect OCP logs for Ecosystem tests (e.g. upgrade) decorated with the
purple squad marker.

Do not collect logs again at the end of a successful execution when the
logs were already collected at least once during the run by a failed test.

Signed-off-by: Petr Balogh
---
 conf/README.md                             |   4 +-
 ocs_ci/deployment/deployment.py            |  18 ++-
 ocs_ci/framework/conf/default_config.yaml  |   1 +
 .../pytest_customization/ocscilib.py       |  34 ++++-
 ocs_ci/ocs/defaults.py                     |   1 +
 ocs_ci/ocs/utils.py                        | 127 ++++++++++++++----
 ocs_ci/utility/utils.py                    |  22 ++-
 tests/conftest.py                          |  54 ++++++--
 8 files changed, 211 insertions(+), 50 deletions(-)

diff --git a/conf/README.md b/conf/README.md
index b4fdcad163c..fb1c4927cc6 100644
--- a/conf/README.md
+++ b/conf/README.md
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
 * `save_mem_report` - If True, test run memory report CSV file will be saved
   in `RUN["log_dir"]/stats_log_dir_` directory along with .peak_rss_table, .peak_vms_table reports.
   The option may be enforced by exporting env variable: export SAVE_MEM_REPORT=true
+* `max_mg_fail_attempts` - Maximum number of attempts to run MG commands, to prevent
+  spending time on MG runs which keep timing out.
 
 #### ENV_DATA
 
@@ -364,7 +366,7 @@ Upgrade related configuration data.
 * `upgrade_logging_channel` - OCP logging channel to upgrade with
 * `upgrade_ui` - Perform upgrade via UI (Not all the versions are supported, please look at the code)
 * `upgrade_acm_version` - ACM version to which we have to upgrade
-* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
+* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
   example: /rh-osbs/iib:565330
 
 #### AUTH
diff --git a/ocs_ci/deployment/deployment.py b/ocs_ci/deployment/deployment.py
index 3538d00900e..03b46e271d5 100644
--- a/ocs_ci/deployment/deployment.py
+++ b/ocs_ci/deployment/deployment.py
@@ -231,7 +231,11 @@ def do_deploy_ocp(self, log_cli_level):
                 config.RUN["is_ocp_deployment_failed"] = True
                 logger.error(e)
                 if config.REPORTING["gather_on_deploy_failure"]:
-                    collect_ocs_logs("deployment", ocs=False)
+                    collect_ocs_logs(
+                        "deployment",
+                        ocs=False,
+                        timeout=defaults.MUST_GATHER_TIMEOUT,
+                    )
                 raise
 
     def do_deploy_submariner(self):
@@ -381,8 +385,16 @@ def do_deploy_ocs(self):
             if config.REPORTING["gather_on_deploy_failure"]:
                 # Let's do the collections separately to guard against one
                 # of them failing
-                collect_ocs_logs("deployment", ocs=False)
-                collect_ocs_logs("deployment", ocp=False)
+                collect_ocs_logs(
+                    "deployment",
+                    ocs=False,
+                    timeout=defaults.MUST_GATHER_TIMEOUT,
+                )
+                collect_ocs_logs(
+                    "deployment",
+                    ocp=False,
+                    timeout=defaults.MUST_GATHER_TIMEOUT,
+                )
             raise
         config.reset_ctx()
         # Run ocs_install_verification here only in case of multicluster.
diff --git a/ocs_ci/framework/conf/default_config.yaml b/ocs_ci/framework/conf/default_config.yaml
index 82a9ae26142..494f5b7d886 100644
--- a/ocs_ci/framework/conf/default_config.yaml
+++ b/ocs_ci/framework/conf/default_config.yaml
@@ -141,6 +141,7 @@ REPORTING:
   gather_on_deploy_failure: true
   collect_logs_on_success_run: False
   rp_client_log_level: "ERROR"
+  max_mg_fail_attempts: 3
 
 # This is the default information about environment.
 ENV_DATA:
diff --git a/ocs_ci/framework/pytest_customization/ocscilib.py b/ocs_ci/framework/pytest_customization/ocscilib.py
index 958ac3fd834..cca6771f3f6 100644
--- a/ocs_ci/framework/pytest_customization/ocscilib.py
+++ b/ocs_ci/framework/pytest_customization/ocscilib.py
@@ -20,6 +20,7 @@
     ClusterNameNotProvidedError,
     ClusterPathNotProvidedError,
 )
+from ocs_ci.ocs import defaults
 from ocs_ci.ocs.constants import (
     CLUSTER_NAME_MAX_CHARACTERS,
     CLUSTER_NAME_MIN_CHARACTERS,
@@ -31,7 +32,7 @@
 )
 from ocs_ci.ocs.cluster import check_clusters
 from ocs_ci.ocs.resources.ocs import get_version_info
-from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
+from ocs_ci.ocs import utils
 from ocs_ci.utility.utils import (
     dump_config_to_file,
     get_ceph_version,
@@ -729,34 +730,53 @@ def pytest_runtest_makereport(item, call):
         and ocsci_config.RUN.get("cli_params").get("collect-logs")
         and not ocsci_config.RUN.get("cli_params").get("deploy")
     ):
+        item_markers = {marker.name for marker in item.iter_markers()}
         test_case_name = item.name
+        # TODO: We should avoid paths and rely on markers, see issue:
+        # https://github.com/red-hat-storage/ocs-ci/issues/10526
         ocp_logs_collection = (
             True
             if any(
                 x in item.location[0]
                 for x in [
-                    "ecosystem",
-                    "e2e/performance",
                     "tests/functional/z_cluster",
                 ]
             )
             else False
         )
+        ocp_markers_to_collect = {
+            "performance",
+            "purple_squad",
+        }
+        if ocp_markers_to_collect & item_markers:
+            ocp_logs_collection = True
         ocs_logs_collection = (
             False
             if any(x in item.location[0] for x in ["_ui", "must_gather"])
             else True
         )
-        mcg_logs_collection = (
-            True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
+        mcg_markers_to_collect = {
+            "mcg",
+            "purple_squad",
+        }
+        # For every MG failure, extend the timeout of the next attempt by 20 minutes
+        adjusted_timeout = utils.mg_fail_count * 1200
+        timeout = ocsci_config.REPORTING.get(
+            "must_gather_timeout", defaults.MUST_GATHER_TIMEOUT + adjusted_timeout
         )
+        log.info(f"Adjusted timeout for MG is {timeout} seconds")
+        mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
         try:
             if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
-                collect_ocs_logs(
+                utils.collect_ocs_logs(
                     dir_name=test_case_name,
                     ocp=ocp_logs_collection,
                     ocs=ocs_logs_collection,
                     mcg=mcg_logs_collection,
+                    silent=True,
+                    output_file=True,
+                    skip_after_max_fail=True,
+                    timeout=timeout,
                 )
         except Exception:
             log.exception("Failed to collect OCS logs")
@@ -770,7 +790,7 @@
         metrics = item.get_closest_marker("gather_metrics_on_fail").args
         try:
             threading_lock = call.getfixturevalue("threading_lock")
-            collect_prometheus_metrics(
+            utils.collect_prometheus_metrics(
                 metrics,
                 f"{item.name}-{call.when}",
                 call.start,
diff --git a/ocs_ci/ocs/defaults.py b/ocs_ci/ocs/defaults.py
index 437a4f350b0..d6437ed7c77 100644
--- a/ocs_ci/ocs/defaults.py
+++ b/ocs_ci/ocs/defaults.py
@@ -168,6 +168,7 @@
 # Must-gather:
 MUST_GATHER_UPSTREAM_IMAGE = "quay.io/ocs-dev/ocs-must-gather"
 MUST_GATHER_UPSTREAM_TAG = "latest"
+MUST_GATHER_TIMEOUT = 3600
 
 # CrushDeviceClass
 CRUSH_DEVICE_CLASS = "ssd"
diff --git a/ocs_ci/ocs/utils.py b/ocs_ci/ocs/utils.py
index f2dfbfdee96..57d2ce67179 100644
--- a/ocs_ci/ocs/utils.py
+++ b/ocs_ci/ocs/utils.py
@@ -4,6 +4,7 @@
 import os
 import pickle
 import re
+import threading
 import time
 import traceback
 import subprocess
@@ -21,7 +22,7 @@
 from paramiko.ssh_exception import SSHException
 
 from ocs_ci.framework import config as ocsci_config, config
-from ocs_ci.ocs import constants
+from ocs_ci.ocs import constants, defaults
 from ocs_ci.ocs.external_ceph import RolesContainer, Ceph, CephNode
 from ocs_ci.ocs.clients import WinNode
 from ocs_ci.ocs.exceptions import (
@@ -56,6 +57,11 @@
 log = logging.getLogger(__name__)
 
+mg_fail_count = 0
+mg_last_fail = None
+mg_collected_logs = 0
+mg_collected_types = set()
+mg_lock = threading.Lock()
 
 
 def create_ceph_nodes(cluster_conf, inventory, osp_cred, run_id, instances_name=None):
@@ -933,7 +939,16 @@ def apply_oc_resource(
     occli.apply(cfg_file)
 
 
-def run_must_gather(log_dir_path, image, command=None, cluster_config=None):
+def run_must_gather(
+    log_dir_path,
+    image,
+    command=None,
+    cluster_config=None,
+    silent=False,
+    output_file=None,
+    skip_after_max_fail=False,
+    timeout=defaults.MUST_GATHER_TIMEOUT,
+):
     """
     Runs the must-gather tool against the cluster
 
@@ -942,26 +957,33 @@
         image (str): must-gather image registry path
         command (str): optional command to execute within the must-gather image
         cluster_config (MultiClusterConfig): Holds specifc cluster config object
             in case of multicluster
+        silent (bool): True to run in silent mode
+        output_file (bool): True to direct the whole output to a file instead of printing it to the
+            log (applies only if silent is True).
+        skip_after_max_fail (bool): Skip MG collection once the maximum number of failed MG
+            attempts has been reached.
+        timeout (int): Max timeout to wait for MG to complete before aborting the MG execution.
 
     Returns:
         mg_output (str): must-gather cli output
 
     """
-    # Must-gather has many changes on 4.6 which add more time to the collection.
-    # https://github.com/red-hat-storage/ocs-ci/issues/3240
+    global mg_fail_count, mg_last_fail, mg_collected_logs
+
+    max_mg_fail_attempts = config.REPORTING.get("max_mg_fail_attempts")
+    if skip_after_max_fail:
+        with mg_lock:
+            if mg_fail_count > max_mg_fail_attempts:
+                log.warning(
+                    f"MG collection is skipped because MG already failed {mg_fail_count} times!"
+                    f" Last error occurred at: {mg_last_fail}"
+                )
+                return
     if not cluster_config:
         cluster_config = ocsci_config
     mg_output = ""
-    ocs_version = version.get_semantic_ocs_version_from_config()
-    if ocs_version >= version.VERSION_4_10:
-        timeout = 2100
-    elif ocs_version >= version.VERSION_4_6:
-        timeout = 1500
-    else:
-        timeout = 600
-
-    must_gather_timeout = cluster_config.REPORTING.get("must_gather_timeout", timeout)
+    timestamp = time.time()
     log.info(f"Must gather image: {image} will be used.")
     create_directory_path(log_dir_path)
     cmd = f"adm must-gather --image={image} --dest-dir={log_dir_path}"
@@ -969,29 +991,33 @@
         cmd += f" -- {command}"
     log.info(f"OCS logs will be placed in location {log_dir_path}")
+    if output_file:
+        output_file = os.path.join(log_dir_path, f"mg_output_{timestamp}.log")
+        log.info(f"Must gather stderr log will be placed in: {output_file}")
     occli = OCP()
     try:
         mg_output = occli.exec_oc_cmd(
             cmd,
             out_yaml_format=False,
-            timeout=must_gather_timeout,
+            timeout=timeout,
             cluster_config=cluster_config,
+            silent=silent,
+            output_file=output_file,
         )
         if config.DEPLOYMENT["external_mode"]:
             collect_ceph_external(path=log_dir_path)
-    except CommandFailed as ex:
-        log.error(
-            f"Failed during must gather logs! Error: {ex}"
-            f"Must-Gather Output: {mg_output}"
-        )
+        with mg_lock:
+            mg_collected_logs += 1
+    except (CommandFailed, TimeoutExpired) as ex:
+        log.error(f"Failed during must gather logs! Error: {ex}")
+        with mg_lock:
+            mg_fail_count += 1
+            mg_last_fail = datetime.datetime.now()
+
+        if mg_output:
+            log.error(f"Must-Gather Output: {mg_output}")
         export_mg_pods_logs(log_dir_path=log_dir_path)
-    except TimeoutExpired as ex:
-        log.error(
-            f"Failed during must gather logs! Error: {ex}"
-            f"Must-Gather Output: {mg_output}"
-        )
-        export_mg_pods_logs(log_dir_path=log_dir_path)
 
     return mg_output
Error: {ex}") + with mg_lock: + mg_fail_count += 1 + mg_last_fail = datetime.datetime.now() + + if mg_output: + log.error(f"Must-Gather Output: {mg_output}") export_mg_pods_logs(log_dir_path=log_dir_path) - except TimeoutExpired as ex: - log.error( - f"Failed during must gather logs! Error: {ex}" - f"Must-Gather Output: {mg_output}" - ) - export_mg_pods_logs(log_dir_path=log_dir_path) return mg_output @@ -1176,11 +1202,16 @@ def _collect_ocs_logs( mcg=False, status_failure=True, ocs_flags=None, + silent=False, + output_file=None, + skip_after_max_fail=False, + timeout=defaults.MUST_GATHER_TIMEOUT, ): """ This function runs in thread """ + global mg_collected_types log.info( ( f"RUNNING IN CTX: {cluster_config.ENV_DATA['cluster_name']} RUNID: = {cluster_config.RUN['run_id']}" @@ -1232,7 +1263,12 @@ def _collect_ocs_logs( ocs_must_gather_image_and_tag, cluster_config=cluster_config, command=ocs_flags, + silent=silent, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) + mg_collected_types.add("ocs") if ( ocsci_config.DEPLOYMENT.get("disconnected") and "cannot stat 'jq'" in mg_output @@ -1246,14 +1282,23 @@ def _collect_ocs_logs( if cluster_config.DEPLOYMENT.get("disconnected"): ocp_must_gather_image = mirror_image(ocp_must_gather_image) run_must_gather( - ocp_log_dir_path, ocp_must_gather_image, cluster_config=cluster_config + ocp_log_dir_path, + ocp_must_gather_image, + cluster_config=cluster_config, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) run_must_gather( ocp_log_dir_path, ocp_must_gather_image, "/usr/bin/gather_service_logs worker", cluster_config=cluster_config, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) + mg_collected_types.add("ocp") if mcg: counter = 0 while counter < 5: @@ -1266,6 +1311,7 @@ def _collect_ocs_logs( ): break collect_noobaa_db_dump(log_dir_path, cluster_config) + mg_collected_types.add("mcg") break except CommandFailed as ex: log.error(f"Failed to dump noobaa DB! Error: {ex}") @@ -1327,7 +1373,16 @@ def _collect_ocs_logs( def collect_ocs_logs( - dir_name, ocp=True, ocs=True, mcg=False, status_failure=True, ocs_flags=None + dir_name, + ocp=True, + ocs=True, + mcg=False, + status_failure=True, + ocs_flags=None, + silent=False, + output_file=None, + skip_after_max_fail=False, + timeout=defaults.MUST_GATHER_TIMEOUT, ): """ Collects OCS logs @@ -1341,6 +1396,12 @@ def collect_ocs_logs( status_failure (bool): Whether the collection is after success or failure, allows better naming for folders under logs directory ocs_flags (str): flags to ocs must gather command for example ["-- /usr/bin/gather -cs"] + silent (bool): True if silent mode + output_file (bool): True if direct whole output to file instead of printing it out to log (apply + only if silent is True). + skip_after_max_fail (bool): When max number failed attempts to collect MG reached, will skip + MG collection. + timeout (int): Max timeout to wait for MG to complete before aborting the MG execution. 
""" results = list() @@ -1357,6 +1418,10 @@ def collect_ocs_logs( mcg=False, status_failure=status_failure, ocs_flags=ocs_flags, + silent=silent, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) ) if ocs: @@ -1370,6 +1435,10 @@ def collect_ocs_logs( mcg=False, status_failure=status_failure, ocs_flags=ocs_flags, + silent=silent, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) ) if mcg: @@ -1383,6 +1452,10 @@ def collect_ocs_logs( mcg=mcg, status_failure=status_failure, ocs_flags=ocs_flags, + silent=silent, + output_file=output_file, + skip_after_max_fail=skip_after_max_fail, + timeout=timeout, ) ) diff --git a/ocs_ci/utility/utils.py b/ocs_ci/utility/utils.py index 624f7eebf8c..fc500517f5e 100644 --- a/ocs_ci/utility/utils.py +++ b/ocs_ci/utility/utils.py @@ -461,6 +461,7 @@ def run_cmd( threading_lock=None, silent=False, cluster_config=None, + output_file=None, **kwargs, ): """ @@ -478,6 +479,7 @@ def run_cmd( threading_lock (threading.RLock): threading.RLock object that is used for handling concurrent oc commands silent (bool): If True will silent errors from the server, default false + output_file (str): path where to write stderr from command - apply only when silent mode is True Raises: CommandFailed: In case the command execution fails @@ -493,6 +495,7 @@ def run_cmd( threading_lock, silent=silent, cluster_config=cluster_config, + output_file=output_file, **kwargs, ) return mask_secrets(completed_process.stdout.decode(), secrets) @@ -600,6 +603,7 @@ def exec_cmd( use_shell=False, cluster_config=None, lock_timeout=7200, + output_file=None, **kwargs, ): """ @@ -623,6 +627,7 @@ def exec_cmd( cluster_config (MultiClusterConfig): In case of multicluster environment this object will be non-null lock_timeout (int): maximum timeout to wait for lock to prevent deadlocks (default 2 hours) + output_file (str): path where to write output of stderr from command - apply only when silent mode is True Raises: CommandFailed: In case the command execution fails @@ -696,8 +701,13 @@ def exec_cmd( if len(completed_process.stderr) > 0: if not silent: log.warning(f"Command stderr: {masked_stderr}") + else: + if output_file: + with open(output_file, "a") as out_fd: + out_fd.write(masked_stderr) else: - log.debug("Command stderr is empty") + if not silent: + log.debug("Command stderr is empty") log.debug(f"Command return code: {completed_process.returncode}") if completed_process.returncode and not ignore_error: masked_stderr = bin_xml_escape(filter_out_emojis(masked_stderr)) @@ -1712,6 +1722,16 @@ def parse_html_for_email(soup): skipped[0].string.replace_with( f"{skipped_number} skipped ({skips_ceph_health_ratio * 100}% on Ceph health)" ) + from ocs import utils + + if utils.mg_fail_count: + failed_mg_text = soup.new_tag("b") + failed_mg_text.string = ( + f"In this execution Must Gather collection has failed: {utils.mg_fail_count} times!" + " Please check why this has happened!" 
diff --git a/tests/conftest.py b/tests/conftest.py
index f2abdfbd44b..a2834c8fb0d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -79,12 +79,7 @@
 from ocs_ci.ocs.resources.mcg_replication_policy import AwsLogBasedReplicationPolicy
 from ocs_ci.ocs.resources.mockup_bucket_logger import MockupBucketLogger
 from ocs_ci.ocs.scale_lib import FioPodScale
-from ocs_ci.ocs.utils import (
-    setup_ceph_toolbox,
-    collect_ocs_logs,
-    collect_pod_container_rpm_package,
-    get_dr_operator_versions,
-)
+from ocs_ci.ocs import utils
 from ocs_ci.ocs.resources.deployment import Deployment
 from ocs_ci.ocs.resources.job import get_job_obj
 from ocs_ci.ocs.resources.backingstore import (
@@ -1752,7 +1747,7 @@
     # add markers as separated property
     markers = ocsci_config.RUN["cli_params"].get("-m", "").replace(" ", "-")
     record_testsuite_property("rp_markers", markers)
-    dr_operator_versions = get_dr_operator_versions()
+    dr_operator_versions = utils.get_dr_operator_versions()
     for dr_operator_name, dr_operator_version in dr_operator_versions.items():
         record_testsuite_property(f"rp_{dr_operator_name}", dr_operator_version)
 
@@ -4065,7 +4060,7 @@ def ceph_toolbox(request):
     ):
         try:
             # Creating toolbox pod
-            setup_ceph_toolbox()
+            utils.setup_ceph_toolbox()
         except CommandFailed:
             log.info("Failed to create toolbox")
 
@@ -4615,10 +4610,47 @@ def finalizer():
         if not ocsci_config.RUN["cli_params"].get("deploy") and not ocsci_config.RUN[
             "cli_params"
         ].get("teardown"):
+            failure_in_mg = []
             if ocsci_config.REPORTING["collect_logs_on_success_run"]:
-                collect_ocs_logs("testcases", ocs=False, status_failure=False)
-                collect_ocs_logs("testcases", ocp=False, status_failure=False)
-            collect_pod_container_rpm_package("testcases")
+                for mg_target in ["ocs", "ocp"]:
+                    try:
+                        if (
+                            utils.mg_collected_logs
+                            and mg_target in utils.mg_collected_types
+                            and (
+                                utils.mg_fail_count
+                                < config.REPORTING["max_mg_fail_attempts"]
+                            )
+                        ):
+                            log.info(
+                                f"Skipping {mg_target} MG collection as logs were already collected at least once during this run"
+                            )
+                        else:
+                            timeout = defaults.MUST_GATHER_TIMEOUT
+                            if mg_target == "ocs":
+                                timeout = 7200
+                            log.info(f"Collecting {mg_target} logs on successful run!")
+                            utils.collect_ocs_logs(
+                                "testcases",
+                                ocs="ocs" == mg_target,
+                                ocp="ocp" == mg_target,
+                                status_failure=False,
+                                silent=True,
+                                output_file=True,
+                                timeout=timeout,
+                            )
+                    except Exception as ex:
+                        failure_in_mg.append((mg_target, ex))
+                        log.error(
+                            f"Failure in collecting {mg_target} must gather! Exception: {ex}"
+                        )
+            try:
+                utils.collect_pod_container_rpm_package("testcases")
+            except Exception as ex:
+                failure_in_mg.append(("rpm_package_info", ex))
+                log.error(f"Failure in collecting RPM package info! Exception: {ex}")
+            if failure_in_mg:
+                raise failure_in_mg[0][1]
 
     request.addfinalizer(finalizer)
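
Note on the skip-after-max-fail mechanism added in ocs_ci/ocs/utils.py: the module-level
counters (mg_fail_count, mg_last_fail, mg_collected_logs, mg_collected_types) are shared by
the log-collection threads, so every update happens under mg_lock. Below is a minimal,
self-contained sketch of only that pattern; the names run_collection and MAX_FAIL_ATTEMPTS
are illustrative placeholders and are not part of this patch.

import datetime
import logging
import threading

log = logging.getLogger(__name__)

MAX_FAIL_ATTEMPTS = 3  # stands in for REPORTING["max_mg_fail_attempts"]

mg_fail_count = 0
mg_last_fail = None
mg_lock = threading.Lock()


def run_collection(collect, skip_after_max_fail=True):
    """Run one collection attempt, skipping it once too many previous attempts failed."""
    global mg_fail_count, mg_last_fail
    if skip_after_max_fail:
        with mg_lock:
            if mg_fail_count > MAX_FAIL_ATTEMPTS:
                log.warning(
                    "Skipping collection, it already failed %s times (last failure: %s)",
                    mg_fail_count,
                    mg_last_fail,
                )
                return None
    try:
        # collect is any callable that performs one must-gather-like run
        return collect()
    except Exception as ex:
        with mg_lock:
            mg_fail_count += 1
            mg_last_fail = datetime.datetime.now()
        log.error("Collection failed: %s", ex)
        return None

Each failed attempt increments the counter under the lock; once the counter exceeds the
configured maximum, further attempts are skipped instead of repeatedly waiting out the
must-gather timeout, which is what skip_after_max_fail=True achieves in run_must_gather().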