Improvements for MG collections
Fixes: #10526
Fixes: #11159

Several improvements to MG log collection, such as preventing MG from being run
over and over when it keeps failing or timing out.

Collect OCP logs for Ecosystem tests (for example, upgrade) that are decorated
with the purple_squad marker.

Do not collect logs again at the end of a successful execution when they were
already collected at least once during the run by a failed test.

Signed-off-by: Petr Balogh <[email protected]>
petr-balogh committed Jan 21, 2025
1 parent 73b2443 commit 996d8d2
Showing 7 changed files with 180 additions and 38 deletions.
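
The changes below hinge on a small piece of shared state added to `ocs_ci/ocs/utils.py`: a module-level failure counter and last-failure timestamp, guarded by a lock, which let later must-gather (MG) attempts be skipped once too many have failed and let each retry be given a longer timeout. The following is a condensed, illustrative sketch of that pattern only; `run_collection` and `MAX_MG_FAIL_ATTEMPTS` are simplified stand-ins, not names from the repository (see the diffs below for the real implementation):

import datetime
import threading

# Shared module-level state, mirroring mg_fail_count / mg_last_fail / mg_lock below.
mg_fail_count = 0
mg_last_fail = None
mg_lock = threading.Lock()
MAX_MG_FAIL_ATTEMPTS = 3  # stand-in for config.REPORTING["max_mg_fail_attempts"]


def run_collection(collect, skip_after_max_fail=True, base_timeout=3600):
    """Run one MG attempt; skip it entirely once too many attempts have failed."""
    global mg_fail_count, mg_last_fail
    if skip_after_max_fail:
        with mg_lock:
            if mg_fail_count > MAX_MG_FAIL_ATTEMPTS:
                print(
                    f"Skipping MG, it already failed {mg_fail_count} times "
                    f"(last failure: {mg_last_fail})"
                )
                return None
    # Every previous failure buys the next attempt an extra 20 minutes.
    timeout = base_timeout + mg_fail_count * 1200
    try:
        return collect(timeout=timeout)
    except Exception as exc:
        with mg_lock:
            mg_fail_count += 1
            mg_last_fail = datetime.datetime.now()
        print(f"MG attempt failed: {exc}")
        return None
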
4 changes: 3 additions & 1 deletion conf/README.md
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
exporting env variable: export SAVE_MEM_REPORT=true
* `max_mg_fail_attempts` - Maximum number of failed MG attempts after which further MG runs are skipped,
  to avoid spending more time on MG commands that keep timing out.

#### ENV_DATA

@@ -364,7 +366,7 @@ Upgrade related configuration data.
* `upgrade_logging_channel` - OCP logging channel to upgrade with
* `upgrade_ui` - Perform upgrade via UI (Not all the versions are supported, please look at the code)
* `upgrade_acm_version` - ACM version to which we have to upgrade
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
example: <brew_registry_url>/rh-osbs/iib:565330

#### AUTH
6 changes: 3 additions & 3 deletions ocs_ci/deployment/deployment.py
Expand Up @@ -231,7 +231,7 @@ def do_deploy_ocp(self, log_cli_level):
config.RUN["is_ocp_deployment_failed"] = True
logger.error(e)
if config.REPORTING["gather_on_deploy_failure"]:
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocs=False, timeout=3600)
raise

def do_deploy_submariner(self):
@@ -381,8 +381,8 @@ def do_deploy_ocs(self):
if config.REPORTING["gather_on_deploy_failure"]:
# Let's do the collections separately to guard against one
# of them failing
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocp=False)
collect_ocs_logs("deployment", ocs=False, timeout=3600)
collect_ocs_logs("deployment", ocp=False, timeout=3600)
raise
config.reset_ctx()
# Run ocs_install_verification here only in case of multicluster.
1 change: 1 addition & 0 deletions ocs_ci/framework/conf/default_config.yaml
Expand Up @@ -141,6 +141,7 @@ REPORTING:
gather_on_deploy_failure: true
collect_logs_on_success_run: False
rp_client_log_level: "ERROR"
max_mg_fail_attempts: 3

# This is the default information about environment.
ENV_DATA:
29 changes: 24 additions & 5 deletions ocs_ci/framework/pytest_customization/ocscilib.py
Expand Up @@ -31,7 +31,7 @@
)
from ocs_ci.ocs.cluster import check_clusters
from ocs_ci.ocs.resources.ocs import get_version_info
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics, mg_fail_count
from ocs_ci.utility.utils import (
dump_config_to_file,
get_ceph_version,
@@ -729,34 +729,53 @@ def pytest_runtest_makereport(item, call):
and ocsci_config.RUN.get("cli_params").get("collect-logs")
and not ocsci_config.RUN.get("cli_params").get("deploy")
):
item_markers = {marker.name for marker in item.iter_markers()}
test_case_name = item.name
# TODO: We should avoid paths and rely on markers issue:
# https://github.com/red-hat-storage/ocs-ci/issues/10526
ocp_logs_collection = (
True
if any(
x in item.location[0]
for x in [
"ecosystem",
"e2e/performance",
"tests/functional/z_cluster",
]
)
else False
)
ocp_markers_to_collect = {
"performance",
"purple_squad",
}
if ocp_markers_to_collect & item_markers:
ocp_logs_collection = True
ocs_logs_collection = (
False
if any(x in item.location[0] for x in ["_ui", "must_gather"])
else True
)
mcg_logs_collection = (
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
mcg_markers_to_collect = {
"mcg",
"purple_squad",
}
# For every previous MG failure, extend the timeout of the next attempt by 20 minutes
adjusted_timeout = mg_fail_count * 1200
timeout = ocsci_config.REPORTING.get(
"must_gather_timeout", 3600 + adjusted_timeout
)
log.info(f"Adjusted timeout for MG is {timeout} seconds")
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
try:
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
collect_ocs_logs(
dir_name=test_case_name,
ocp=ocp_logs_collection,
ocs=ocs_logs_collection,
mcg=mcg_logs_collection,
silent=True,
output_file=True,
skip_after_max_fail=True,
timeout=timeout,
)
except Exception:
log.exception("Failed to collect OCS logs")
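
To make the timeout adjustment in the pytest hook above concrete, here is a small worked example of the values it produces, assuming `must_gather_timeout` is not set explicitly in `REPORTING` (an explicit `must_gather_timeout` would override the computed default of `3600 + mg_fail_count * 1200`):

# Illustrative only: timeouts produced by the adjusted-timeout logic above when
# REPORTING has no explicit "must_gather_timeout" value.
for mg_fail_count in range(4):
    adjusted_timeout = mg_fail_count * 1200   # 20 minutes per previous MG failure
    timeout = 3600 + adjusted_timeout         # one-hour base timeout
    print(f"{mg_fail_count} previous failure(s) -> timeout {timeout} s")
# 0 previous failure(s) -> timeout 3600 s
# 1 previous failure(s) -> timeout 4800 s
# 2 previous failure(s) -> timeout 6000 s
# 3 previous failure(s) -> timeout 7200 s
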
125 changes: 99 additions & 26 deletions ocs_ci/ocs/utils.py
@@ -4,6 +4,7 @@
import os
import pickle
import re
import threading
import time
import traceback
import subprocess
@@ -56,6 +57,11 @@


log = logging.getLogger(__name__)
mg_fail_count = 0
mg_last_fail = None
mg_collected_logs = 0
mg_collected_types = set()
mg_lock = threading.Lock()


def create_ceph_nodes(cluster_conf, inventory, osp_cred, run_id, instances_name=None):
@@ -933,7 +939,16 @@ def apply_oc_resource(
occli.apply(cfg_file)


def run_must_gather(log_dir_path, image, command=None, cluster_config=None):
def run_must_gather(
log_dir_path,
image,
command=None,
cluster_config=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
Runs the must-gather tool against the cluster
@@ -942,56 +957,67 @@ def run_must_gather(log_dir_path, image, command=None, cluster_config=None):
image (str): must-gather image registry path
command (str): optional command to execute within the must-gather image
cluster_config (MultiClusterConfig): Holds specific cluster config object in case of multicluster
silent (bool): True if silent mode
output_file (bool): If True, direct the whole output to a file instead of printing it to the log
(applies only if silent is True).
skip_after_max_fail (bool): If True, skip MG collection once the maximum number of failed MG
collection attempts has been reached.
timeout (int): Max timeout to wait for MG to complete before aborting the MG execution.
Returns:
mg_output (str): must-gather cli output
"""
# Must-gather has many changes on 4.6 which add more time to the collection.
# https://github.com/red-hat-storage/ocs-ci/issues/3240
global mg_fail_count, mg_last_fail, mg_collected_logs

max_mg_fail_attempts = config.REPORTING.get("max_mg_fail_attempts")
if skip_after_max_fail:
with mg_lock:
if mg_fail_count > max_mg_fail_attempts:
log.warning(
f"MG collection is skipped because MG already failed {mg_fail_count} times!"
f" Last error occurred at: {mg_last_fail}"
)
return
if not cluster_config:
cluster_config = ocsci_config
mg_output = ""
ocs_version = version.get_semantic_ocs_version_from_config()
if ocs_version >= version.VERSION_4_10:
timeout = 2100
elif ocs_version >= version.VERSION_4_6:
timeout = 1500
else:
timeout = 600

must_gather_timeout = cluster_config.REPORTING.get("must_gather_timeout", timeout)

timestamp = time.time()
log.info(f"Must gather image: {image} will be used.")
create_directory_path(log_dir_path)
cmd = f"adm must-gather --image={image} --dest-dir={log_dir_path}"
if command:
cmd += f" -- {command}"

log.info(f"OCS logs will be placed in location {log_dir_path}")
if output_file:
output_file = os.path.join(log_dir_path, f"mg_output_{timestamp}.log")
log.info(f"Must gather std error log will be placed in: {output_file}")
occli = OCP()
try:
mg_output = occli.exec_oc_cmd(
cmd,
out_yaml_format=False,
timeout=must_gather_timeout,
timeout=timeout,
cluster_config=cluster_config,
silent=silent,
output_file=output_file,
)
if config.DEPLOYMENT["external_mode"]:
collect_ceph_external(path=log_dir_path)
except CommandFailed as ex:
log.error(
f"Failed during must gather logs! Error: {ex}"
f"Must-Gather Output: {mg_output}"
)
with mg_lock:
mg_collected_logs += 1
except (CommandFailed, TimeoutExpired) as ex:
log.error(f"Failed during must gather logs! Error: {ex}")
with mg_lock:
mg_fail_count += 1
mg_last_fail = datetime.datetime.now()

if mg_output:
log.error(f"Must-Gather Output: {mg_output}")
export_mg_pods_logs(log_dir_path=log_dir_path)

except TimeoutExpired as ex:
log.error(
f"Failed during must gather logs! Error: {ex}"
f"Must-Gather Output: {mg_output}"
)
export_mg_pods_logs(log_dir_path=log_dir_path)
return mg_output


@@ -1176,11 +1202,16 @@ def _collect_ocs_logs(
mcg=False,
status_failure=True,
ocs_flags=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
This function runs in thread
"""
global mg_collected_types
log.info(
(
f"RUNNING IN CTX: {cluster_config.ENV_DATA['cluster_name']} RUNID: = {cluster_config.RUN['run_id']}"
@@ -1232,7 +1263,12 @@
ocs_must_gather_image_and_tag,
cluster_config=cluster_config,
command=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
mg_collected_types.add("ocs")
if (
ocsci_config.DEPLOYMENT.get("disconnected")
and "cannot stat 'jq'" in mg_output
@@ -1246,14 +1282,23 @@
if cluster_config.DEPLOYMENT.get("disconnected"):
ocp_must_gather_image = mirror_image(ocp_must_gather_image)
run_must_gather(
ocp_log_dir_path, ocp_must_gather_image, cluster_config=cluster_config
ocp_log_dir_path,
ocp_must_gather_image,
cluster_config=cluster_config,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
run_must_gather(
ocp_log_dir_path,
ocp_must_gather_image,
"/usr/bin/gather_service_logs worker",
cluster_config=cluster_config,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
mg_collected_types.add("ocp")
if mcg:
counter = 0
while counter < 5:
@@ -1266,6 +1311,7 @@
):
break
collect_noobaa_db_dump(log_dir_path, cluster_config)
mg_collected_types.add("mcg")
break
except CommandFailed as ex:
log.error(f"Failed to dump noobaa DB! Error: {ex}")
@@ -1327,7 +1373,16 @@ def _collect_ocs_logs(


def collect_ocs_logs(
dir_name, ocp=True, ocs=True, mcg=False, status_failure=True, ocs_flags=None
dir_name,
ocp=True,
ocs=True,
mcg=False,
status_failure=True,
ocs_flags=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
Collects OCS logs
@@ -1341,6 +1396,12 @@
status_failure (bool): Whether the collection is after success or failure,
allows better naming for folders under logs directory
ocs_flags (str): flags to ocs must gather command for example ["-- /usr/bin/gather -cs"]
silent (bool): True if silent mode
output_file (bool): If True, direct the whole output to a file instead of printing it to the log
(applies only if silent is True).
skip_after_max_fail (bool): If True, skip MG collection once the maximum number of failed MG
collection attempts has been reached.
timeout (int): Max timeout to wait for MG to complete before aborting the MG execution.
"""
results = list()
@@ -1357,6 +1418,10 @@
mcg=False,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)
if ocs:
@@ -1370,6 +1435,10 @@
mcg=False,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)
if mcg:
@@ -1383,6 +1452,10 @@
mcg=mcg,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)

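With the expanded `collect_ocs_logs` signature above, callers such as the pytest hook in `ocscilib.py` pass the new knobs straight through. A hedged usage sketch follows; the directory name and argument values are illustrative examples, not taken from the repository:

# Illustrative call only; argument values are examples, not repository defaults.
from ocs_ci.ocs.utils import collect_ocs_logs

collect_ocs_logs(
    dir_name="test_example_failure",  # hypothetical test name used for the log directory
    ocp=True,                         # e.g. a purple_squad-decorated ecosystem test
    ocs=True,
    mcg=False,
    silent=True,                      # keep must-gather output out of the console log
    output_file=True,                 # write MG stderr to mg_output_<timestamp>.log
    skip_after_max_fail=True,         # honour REPORTING["max_mg_fail_attempts"]
    timeout=3600,                     # abort the MG run after this many seconds
)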