Improvements for MG collections
Fixes: #10526
Fixes: #11159

Several improvements to MG log collection, such as preventing MG from being run
over and over when it keeps failing or timing out.

Collect OCP logs for Ecosystem tests (for example, upgrade) that are decorated
with the purple_squad marker.

Do not collect logs again at the end of a successful execution when they were
already collected at least once during the run by a failed test.

Signed-off-by: Petr Balogh <[email protected]>
petr-balogh committed Jan 21, 2025
1 parent 73b2443 commit 996d8d2
Showing 7 changed files with 180 additions and 38 deletions.
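
The changes below hinge on a small piece of shared state added to `ocs_ci/ocs/utils.py`: a module-level failure counter and last-failure timestamp, guarded by a lock, which let later must-gather (MG) attempts be skipped once too many have failed and let each retry be given a longer timeout. The following is a condensed, illustrative sketch of that pattern only; `run_collection` and `MAX_MG_FAIL_ATTEMPTS` are simplified stand-ins, not names from the repository (see the diffs below for the real implementation):

import datetime
import threading

# Shared module-level state, mirroring mg_fail_count / mg_last_fail / mg_lock below.
mg_fail_count = 0
mg_last_fail = None
mg_lock = threading.Lock()
MAX_MG_FAIL_ATTEMPTS = 3  # stand-in for config.REPORTING["max_mg_fail_attempts"]


def run_collection(collect, skip_after_max_fail=True, base_timeout=3600):
    """Run one MG attempt; skip it entirely once too many attempts have failed."""
    global mg_fail_count, mg_last_fail
    if skip_after_max_fail:
        with mg_lock:
            if mg_fail_count > MAX_MG_FAIL_ATTEMPTS:
                print(
                    f"Skipping MG, it already failed {mg_fail_count} times "
                    f"(last failure: {mg_last_fail})"
                )
                return None
    # Every previous failure buys the next attempt an extra 20 minutes.
    timeout = base_timeout + mg_fail_count * 1200
    try:
        return collect(timeout=timeout)
    except Exception as exc:
        with mg_lock:
            mg_fail_count += 1
            mg_last_fail = datetime.datetime.now()
        print(f"MG attempt failed: {exc}")
        return None
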
4 changes: 3 additions & 1 deletion conf/README.md
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
exporting env variable: export SAVE_MEM_REPORT=true
* `max_mg_fail_attempts` - Maximum number of failed MG attempts after which further MG runs are skipped,
  to avoid spending more time on MG commands that keep timing out.

#### ENV_DATA

@@ -364,7 +366,7 @@ Upgrade related configuration data.
* `upgrade_logging_channel` - OCP logging channel to upgrade with
* `upgrade_ui` - Perform upgrade via UI (Not all the versions are supported, please look at the code)
* `upgrade_acm_version` - ACM version to which we have to upgrade
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
example: <brew_registry_url>/rh-osbs/iib:565330

#### AUTH
6 changes: 3 additions & 3 deletions ocs_ci/deployment/deployment.py
Expand Up @@ -231,7 +231,7 @@ def do_deploy_ocp(self, log_cli_level):
config.RUN["is_ocp_deployment_failed"] = True
logger.error(e)
if config.REPORTING["gather_on_deploy_failure"]:
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocs=False, timeout=3600)
raise

def do_deploy_submariner(self):
@@ -381,8 +381,8 @@ def do_deploy_ocs(self):
if config.REPORTING["gather_on_deploy_failure"]:
# Let's do the collections separately to guard against one
# of them failing
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocp=False)
collect_ocs_logs("deployment", ocs=False, timeout=3600)
collect_ocs_logs("deployment", ocp=False, timeout=3600)
raise
config.reset_ctx()
# Run ocs_install_verification here only in case of multicluster.
1 change: 1 addition & 0 deletions ocs_ci/framework/conf/default_config.yaml
Expand Up @@ -141,6 +141,7 @@ REPORTING:
gather_on_deploy_failure: true
collect_logs_on_success_run: False
rp_client_log_level: "ERROR"
max_mg_fail_attempts: 3

# This is the default information about environment.
ENV_DATA:
29 changes: 24 additions & 5 deletions ocs_ci/framework/pytest_customization/ocscilib.py
Expand Up @@ -31,7 +31,7 @@
)
from ocs_ci.ocs.cluster import check_clusters
from ocs_ci.ocs.resources.ocs import get_version_info
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics, mg_fail_count
from ocs_ci.utility.utils import (
dump_config_to_file,
get_ceph_version,
@@ -729,34 +729,53 @@ def pytest_runtest_makereport(item, call):
and ocsci_config.RUN.get("cli_params").get("collect-logs")
and not ocsci_config.RUN.get("cli_params").get("deploy")
):
item_markers = {marker.name for marker in item.iter_markers()}
test_case_name = item.name
# TODO: We should avoid paths and rely on markers issue:
# https://github.com/red-hat-storage/ocs-ci/issues/10526
ocp_logs_collection = (
True
if any(
x in item.location[0]
for x in [
"ecosystem",
"e2e/performance",
"tests/functional/z_cluster",
]
)
else False
)
ocp_markers_to_collect = {
"performance",
"purple_squad",
}
if ocp_markers_to_collect & item_markers:
ocp_logs_collection = True
ocs_logs_collection = (
False
if any(x in item.location[0] for x in ["_ui", "must_gather"])
else True
)
mcg_logs_collection = (
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
mcg_markers_to_collect = {
"mcg",
"purple_squad",
}
# For every previous MG failure, extend the timeout of the next attempt by 20 minutes
adjusted_timeout = mg_fail_count * 1200
timeout = ocsci_config.REPORTING.get(
"must_gather_timeout", 3600 + adjusted_timeout
)
log.info(f"Adjusted timeout for MG is {timeout} seconds")
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
try:
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
collect_ocs_logs(
dir_name=test_case_name,
ocp=ocp_logs_collection,
ocs=ocs_logs_collection,
mcg=mcg_logs_collection,
silent=True,
output_file=True,
skip_after_max_fail=True,
timeout=timeout,
)
except Exception:
log.exception("Failed to collect OCS logs")
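
To make the timeout adjustment in the pytest hook above concrete, here is a small worked example of the values it produces, assuming `must_gather_timeout` is not set explicitly in `REPORTING` (an explicit `must_gather_timeout` would override the computed default of `3600 + mg_fail_count * 1200`):

# Illustrative only: timeouts produced by the adjusted-timeout logic above when
# REPORTING has no explicit "must_gather_timeout" value.
for mg_fail_count in range(4):
    adjusted_timeout = mg_fail_count * 1200   # 20 minutes per previous MG failure
    timeout = 3600 + adjusted_timeout         # one-hour base timeout
    print(f"{mg_fail_count} previous failure(s) -> timeout {timeout} s")
# 0 previous failure(s) -> timeout 3600 s
# 1 previous failure(s) -> timeout 4800 s
# 2 previous failure(s) -> timeout 6000 s
# 3 previous failure(s) -> timeout 7200 s
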
125 changes: 99 additions & 26 deletions ocs_ci/ocs/utils.py
@@ -4,6 +4,7 @@
import os
import pickle
import re
import threading
import time
import traceback
import subprocess
@@ -56,6 +57,11 @@


log = logging.getLogger(__name__)
mg_fail_count = 0
mg_last_fail = None
mg_collected_logs = 0
mg_collected_types = set()
mg_lock = threading.Lock()


def create_ceph_nodes(cluster_conf, inventory, osp_cred, run_id, instances_name=None):
@@ -933,7 +939,16 @@ def apply_oc_resource(
occli.apply(cfg_file)


def run_must_gather(log_dir_path, image, command=None, cluster_config=None):
def run_must_gather(
log_dir_path,
image,
command=None,
cluster_config=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
Runs the must-gather tool against the cluster
@@ -942,56 +957,67 @@ def run_must_gather(log_dir_path, image, command=None, cluster_config=None):
image (str): must-gather image registry path
command (str): optional command to execute within the must-gather image
cluster_config (MultiClusterConfig): Holds specific cluster config object in case of multicluster
silent (bool): True if silent mode
output_file (bool): If True, direct the whole output to a file instead of printing it to the log
(applies only if silent is True).
skip_after_max_fail (bool): If True, skip MG collection once the maximum number of failed MG
collection attempts has been reached.
timeout (int): Max timeout to wait for MG to complete before aborting the MG execution.
Returns:
mg_output (str): must-gather cli output
"""
# Must-gather has many changes on 4.6 which add more time to the collection.
# https://github.com/red-hat-storage/ocs-ci/issues/3240
global mg_fail_count, mg_last_fail, mg_collected_logs

max_mg_fail_attempts = config.REPORTING.get("max_mg_fail_attempts")
if skip_after_max_fail:
with mg_lock:
if mg_fail_count > max_mg_fail_attempts:
log.warning(
f"MG collection is skipped because MG already failed {mg_fail_count} times!"
f" Last error occurred at: {mg_last_fail}"
)
return
if not cluster_config:
cluster_config = ocsci_config
mg_output = ""
ocs_version = version.get_semantic_ocs_version_from_config()
if ocs_version >= version.VERSION_4_10:
timeout = 2100
elif ocs_version >= version.VERSION_4_6:
timeout = 1500
else:
timeout = 600

must_gather_timeout = cluster_config.REPORTING.get("must_gather_timeout", timeout)

timestamp = time.time()
log.info(f"Must gather image: {image} will be used.")
create_directory_path(log_dir_path)
cmd = f"adm must-gather --image={image} --dest-dir={log_dir_path}"
if command:
cmd += f" -- {command}"

log.info(f"OCS logs will be placed in location {log_dir_path}")
if output_file:
output_file = os.path.join(log_dir_path, f"mg_output_{timestamp}.log")
log.info(f"Must gather std error log will be placed in: {output_file}")
occli = OCP()
try:
mg_output = occli.exec_oc_cmd(
cmd,
out_yaml_format=False,
timeout=must_gather_timeout,
timeout=timeout,
cluster_config=cluster_config,
silent=silent,
output_file=output_file,
)
if config.DEPLOYMENT["external_mode"]:
collect_ceph_external(path=log_dir_path)
except CommandFailed as ex:
log.error(
f"Failed during must gather logs! Error: {ex}"
f"Must-Gather Output: {mg_output}"
)
with mg_lock:
mg_collected_logs += 1
except (CommandFailed, TimeoutExpired) as ex:
log.error(f"Failed during must gather logs! Error: {ex}")
with mg_lock:
mg_fail_count += 1
mg_last_fail = datetime.datetime.now()

if mg_output:
log.error(f"Must-Gather Output: {mg_output}")
export_mg_pods_logs(log_dir_path=log_dir_path)

except TimeoutExpired as ex:
log.error(
f"Failed during must gather logs! Error: {ex}"
f"Must-Gather Output: {mg_output}"
)
export_mg_pods_logs(log_dir_path=log_dir_path)
return mg_output


@@ -1176,11 +1202,16 @@ def _collect_ocs_logs(
mcg=False,
status_failure=True,
ocs_flags=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
This function runs in thread
"""
global mg_collected_types
log.info(
(
f"RUNNING IN CTX: {cluster_config.ENV_DATA['cluster_name']} RUNID: = {cluster_config.RUN['run_id']}"
@@ -1232,7 +1263,12 @@
ocs_must_gather_image_and_tag,
cluster_config=cluster_config,
command=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
mg_collected_types.add("ocs")
if (
ocsci_config.DEPLOYMENT.get("disconnected")
and "cannot stat 'jq'" in mg_output
@@ -1246,14 +1282,23 @@
if cluster_config.DEPLOYMENT.get("disconnected"):
ocp_must_gather_image = mirror_image(ocp_must_gather_image)
run_must_gather(
ocp_log_dir_path, ocp_must_gather_image, cluster_config=cluster_config
ocp_log_dir_path,
ocp_must_gather_image,
cluster_config=cluster_config,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
run_must_gather(
ocp_log_dir_path,
ocp_must_gather_image,
"/usr/bin/gather_service_logs worker",
cluster_config=cluster_config,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
mg_collected_types.add("ocp")
if mcg:
counter = 0
while counter < 5:
@@ -1266,6 +1311,7 @@
):
break
collect_noobaa_db_dump(log_dir_path, cluster_config)
mg_collected_types.add("mcg")
break
except CommandFailed as ex:
log.error(f"Failed to dump noobaa DB! Error: {ex}")
@@ -1327,7 +1373,16 @@ def _collect_ocs_logs(


def collect_ocs_logs(
dir_name, ocp=True, ocs=True, mcg=False, status_failure=True, ocs_flags=None
dir_name,
ocp=True,
ocs=True,
mcg=False,
status_failure=True,
ocs_flags=None,
silent=False,
output_file=None,
skip_after_max_fail=False,
timeout=2700,
):
"""
Collects OCS logs
@@ -1341,6 +1396,12 @@
status_failure (bool): Whether the collection is after success or failure,
allows better naming for folders under logs directory
ocs_flags (str): flags to ocs must gather command for example ["-- /usr/bin/gather -cs"]
silent (bool): True if silent mode
output_file (bool): If True, direct the whole output to a file instead of printing it to the log
(applies only if silent is True).
skip_after_max_fail (bool): If True, skip MG collection once the maximum number of failed MG
collection attempts has been reached.
timeout (int): Max timeout to wait for MG to complete before aborting the MG execution.
"""
results = list()
@@ -1357,6 +1418,10 @@
mcg=False,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)
if ocs:
@@ -1370,6 +1435,10 @@
mcg=False,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)
if mcg:
@@ -1383,6 +1452,10 @@
mcg=mcg,
status_failure=status_failure,
ocs_flags=ocs_flags,
silent=silent,
output_file=output_file,
skip_after_max_fail=skip_after_max_fail,
timeout=timeout,
)
)

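With the expanded `collect_ocs_logs` signature above, callers such as the pytest hook in `ocscilib.py` pass the new knobs straight through. A hedged usage sketch follows; the directory name and argument values are illustrative examples, not taken from the repository:

# Illustrative call only; argument values are examples, not repository defaults.
from ocs_ci.ocs.utils import collect_ocs_logs

collect_ocs_logs(
    dir_name="test_example_failure",  # hypothetical test name used for the log directory
    ocp=True,                         # e.g. a purple_squad-decorated ecosystem test
    ocs=True,
    mcg=False,
    silent=True,                      # keep must-gather output out of the console log
    output_file=True,                 # write MG stderr to mg_output_<timestamp>.log
    skip_after_max_fail=True,         # honour REPORTING["max_mg_fail_attempts"]
    timeout=3600,                     # abort the MG run after this many seconds
)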