Skip to content

Commit 8d56cdd

Browse files
committed
Improvements for MG collections
Fixes: #10526 Fixes: #11159 Several improvements in MG log collection, such as preventing MG from running over and over when it keeps failing or timing out. Collect OCP logs for Ecosystem tests, like upgrade, decorated with purple squad. Do not collect logs again at the end of execution on success when they were already collected at least once during execution by some failed test. Signed-off-by: Petr Balogh <[email protected]>
1 parent 1f886a1 commit 8d56cdd

File tree

8 files changed

+211
-50
lines changed

8 files changed

+211
-50
lines changed

conf/README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
183183
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
184184
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
185185
exporting env variable: export SAVE_MEM_REPORT=true
186+
* `max_mg_fail_attempts` - Maximum number of failed attempts to run MG commands, to prevent
187+
spending time on MG that keeps timing out.
186188

187189
#### ENV_DATA
188190

@@ -364,7 +366,7 @@ Upgrade related configuration data.
364366
* `upgrade_logging_channel` - OCP logging channel to upgrade with
365367
* `upgrade_ui` - Perform upgrade via UI (Not all the versions are supported, please look at the code)
366368
* `upgrade_acm_version` - ACM version to which we have to upgrade
367-
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
369+
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
368370
example: <brew_registry_url>/rh-osbs/iib:565330
369371

370372
#### AUTH

ocs_ci/deployment/deployment.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,11 @@ def do_deploy_ocp(self, log_cli_level):
231231
config.RUN["is_ocp_deployment_failed"] = True
232232
logger.error(e)
233233
if config.REPORTING["gather_on_deploy_failure"]:
234-
collect_ocs_logs("deployment", ocs=False)
234+
collect_ocs_logs(
235+
"deployment",
236+
ocs=False,
237+
timeout=defaults.MUST_GATHER_TIMEOUT,
238+
)
235239
raise
236240

237241
def do_deploy_submariner(self):
@@ -381,8 +385,16 @@ def do_deploy_ocs(self):
381385
if config.REPORTING["gather_on_deploy_failure"]:
382386
# Let's do the collections separately to guard against one
383387
# of them failing
384-
collect_ocs_logs("deployment", ocs=False)
385-
collect_ocs_logs("deployment", ocp=False)
388+
collect_ocs_logs(
389+
"deployment",
390+
ocs=False,
391+
timeout=defaults.MUST_GATHER_TIMEOUT,
392+
)
393+
collect_ocs_logs(
394+
"deployment",
395+
ocp=False,
396+
timeout=defaults.MUST_GATHER_TIMEOUT,
397+
)
386398
raise
387399
config.reset_ctx()
388400
# Run ocs_install_verification here only in case of multicluster.

ocs_ci/framework/conf/default_config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ REPORTING:
141141
gather_on_deploy_failure: true
142142
collect_logs_on_success_run: False
143143
rp_client_log_level: "ERROR"
144+
max_mg_fail_attempts: 3
144145

145146
# This is the default information about environment.
146147
ENV_DATA:

ocs_ci/framework/pytest_customization/ocscilib.py

+27-7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
ClusterNameNotProvidedError,
2121
ClusterPathNotProvidedError,
2222
)
23+
from ocs_ci.ocs import defaults
2324
from ocs_ci.ocs.constants import (
2425
CLUSTER_NAME_MAX_CHARACTERS,
2526
CLUSTER_NAME_MIN_CHARACTERS,
@@ -31,7 +32,7 @@
3132
)
3233
from ocs_ci.ocs.cluster import check_clusters
3334
from ocs_ci.ocs.resources.ocs import get_version_info
34-
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
35+
from ocs_ci.ocs import utils
3536
from ocs_ci.utility.utils import (
3637
dump_config_to_file,
3738
get_ceph_version,
@@ -729,34 +730,53 @@ def pytest_runtest_makereport(item, call):
729730
and ocsci_config.RUN.get("cli_params").get("collect-logs")
730731
and not ocsci_config.RUN.get("cli_params").get("deploy")
731732
):
733+
item_markers = {marker.name for marker in item.iter_markers()}
732734
test_case_name = item.name
735+
# TODO: We should avoid matching on paths and rely on markers instead, see issue:
736+
# https://github.com/red-hat-storage/ocs-ci/issues/10526
733737
ocp_logs_collection = (
734738
True
735739
if any(
736740
x in item.location[0]
737741
for x in [
738-
"ecosystem",
739-
"e2e/performance",
740742
"tests/functional/z_cluster",
741743
]
742744
)
743745
else False
744746
)
747+
ocp_markers_to_collect = {
748+
"performance",
749+
"purple_squad",
750+
}
751+
if ocp_markers_to_collect & item_markers:
752+
ocp_logs_collection = True
745753
ocs_logs_collection = (
746754
False
747755
if any(x in item.location[0] for x in ["_ui", "must_gather"])
748756
else True
749757
)
750-
mcg_logs_collection = (
751-
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
758+
mcg_markers_to_collect = {
759+
"mcg",
760+
"purple_squad",
761+
}
762+
# For every MG failure, extend the timeout of the next attempt by 20 minutes
763+
adjusted_timeout = utils.mg_fail_count * 1200
764+
timeout = ocsci_config.REPORTING.get(
765+
"must_gather_timeout", defaults.MUST_GATHER_TIMEOUT + adjusted_timeout
752766
)
767+
log.info(f"Adjusted timeout for MG is {timeout} seconds")
768+
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
753769
try:
754770
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
755-
collect_ocs_logs(
771+
utils.collect_ocs_logs(
756772
dir_name=test_case_name,
757773
ocp=ocp_logs_collection,
758774
ocs=ocs_logs_collection,
759775
mcg=mcg_logs_collection,
776+
silent=True,
777+
output_file=True,
778+
skip_after_max_fail=True,
779+
timeout=timeout,
760780
)
761781
except Exception:
762782
log.exception("Failed to collect OCS logs")
@@ -770,7 +790,7 @@ def pytest_runtest_makereport(item, call):
770790
metrics = item.get_closest_marker("gather_metrics_on_fail").args
771791
try:
772792
threading_lock = call.getfixturevalue("threading_lock")
773-
collect_prometheus_metrics(
793+
utils.collect_prometheus_metrics(
774794
metrics,
775795
f"{item.name}-{call.when}",
776796
call.start,

ocs_ci/ocs/defaults.py

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@
168168
# Must-gather:
169169
MUST_GATHER_UPSTREAM_IMAGE = "quay.io/ocs-dev/ocs-must-gather"
170170
MUST_GATHER_UPSTREAM_TAG = "latest"
171+
MUST_GATHER_TIMEOUT = 3600
171172

172173
# CrushDeviceClass
173174
CRUSH_DEVICE_CLASS = "ssd"

0 commit comments

Comments
 (0)