Skip to content

Commit

Permalink
Improvements for MG collections
Browse files Browse the repository at this point in the history
Fixes: #10526
Fixes: #11159

Several improvements to must-gather (MG) log collection, such as not running
MG over and over while it keeps failing or timing out.

Collect OCP logs for ecosystem tests, such as upgrade tests, that are
decorated with the purple_squad marker.

Do not collect logs again at the end of a successful execution when they were
already collected at least once during the execution by some failed test.

Signed-off-by: Petr Balogh <[email protected]>
  • Loading branch information
petr-balogh committed Jan 21, 2025
1 parent 1f886a1 commit a272f62
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 50 deletions.
4 changes: 3 additions & 1 deletion conf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
exporting env variable: export SAVE_MEM_REPORT=true
* `max_mg_fail_attempts` - Maximum number of attempts to run must-gather (MG) commands, to avoid
spending time on MG that keeps timing out.

#### ENV_DATA

Expand Down Expand Up @@ -364,7 +366,7 @@ Upgrade related configuration data.
* `upgrade_logging_channel` - OCP logging channel to upgrade with
* `upgrade_ui` - Perform upgrade via UI (Not all the versions are supported, please look at the code)
* `upgrade_acm_version` - ACM version to which we have to upgrade
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
* `upgrade_acm_registry_image` - ACM Image tag from brew which should be used to upgrade
example: <brew_registry_url>/rh-osbs/iib:565330

#### AUTH
Expand Down
18 changes: 15 additions & 3 deletions ocs_ci/deployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,11 @@ def do_deploy_ocp(self, log_cli_level):
config.RUN["is_ocp_deployment_failed"] = True
logger.error(e)
if config.REPORTING["gather_on_deploy_failure"]:
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise

def do_deploy_submariner(self):
Expand Down Expand Up @@ -381,8 +385,16 @@ def do_deploy_ocs(self):
if config.REPORTING["gather_on_deploy_failure"]:
# Let's do the collections separately to guard against one
# of them failing
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocp=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
collect_ocs_logs(
"deployment",
ocp=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise
config.reset_ctx()
# Run ocs_install_verification here only in case of multicluster.
Expand Down
1 change: 1 addition & 0 deletions ocs_ci/framework/conf/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ REPORTING:
gather_on_deploy_failure: true
collect_logs_on_success_run: False
rp_client_log_level: "ERROR"
max_mg_fail_attempts: 3

# This is the default information about environment.
ENV_DATA:
Expand Down
34 changes: 27 additions & 7 deletions ocs_ci/framework/pytest_customization/ocscilib.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
ClusterNameNotProvidedError,
ClusterPathNotProvidedError,
)
from ocs_ci.ocs import defaults
from ocs_ci.ocs.constants import (
CLUSTER_NAME_MAX_CHARACTERS,
CLUSTER_NAME_MIN_CHARACTERS,
Expand All @@ -31,7 +32,7 @@
)
from ocs_ci.ocs.cluster import check_clusters
from ocs_ci.ocs.resources.ocs import get_version_info
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
from ocs_ci.ocs import utils
from ocs_ci.utility.utils import (
dump_config_to_file,
get_ceph_version,
Expand Down Expand Up @@ -729,34 +730,53 @@ def pytest_runtest_makereport(item, call):
and ocsci_config.RUN.get("cli_params").get("collect-logs")
and not ocsci_config.RUN.get("cli_params").get("deploy")
):
item_markers = {marker.name for marker in item.iter_markers()}
test_case_name = item.name
# TODO: We should avoid paths and rely on markers issue:
# https://github.com/red-hat-storage/ocs-ci/issues/10526
ocp_logs_collection = (
True
if any(
x in item.location[0]
for x in [
"ecosystem",
"e2e/performance",
"tests/functional/z_cluster",
]
)
else False
)
ocp_markers_to_collect = {
"performance",
"purple_squad",
}
if ocp_markers_to_collect & item_markers:
ocp_logs_collection = True
ocs_logs_collection = (
False
if any(x in item.location[0] for x in ["_ui", "must_gather"])
else True
)
mcg_logs_collection = (
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
mcg_markers_to_collect = {
"mcg",
"purple_squad",
}
# For every failure in MG we are trying to extend next attempt by 20 minutes
adjusted_timeout = utils.mg_fail_count * 1200
timeout = ocsci_config.REPORTING.get(
"must_gather_timeout", defaults.MUST_GATHER_TIMEOUT + adjusted_timeout
)
log.info(f"Adjusted timeout for MG is {timeout} seconds")
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
try:
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
collect_ocs_logs(
utils.collect_ocs_logs(
dir_name=test_case_name,
ocp=ocp_logs_collection,
ocs=ocs_logs_collection,
mcg=mcg_logs_collection,
silent=True,
output_file=True,
skip_after_max_fail=True,
timeout=timeout,
)
except Exception:
log.exception("Failed to collect OCS logs")
Expand All @@ -770,7 +790,7 @@ def pytest_runtest_makereport(item, call):
metrics = item.get_closest_marker("gather_metrics_on_fail").args
try:
threading_lock = call.getfixturevalue("threading_lock")
collect_prometheus_metrics(
utils.collect_prometheus_metrics(
metrics,
f"{item.name}-{call.when}",
call.start,
Expand Down
1 change: 1 addition & 0 deletions ocs_ci/ocs/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
# Must-gather:
MUST_GATHER_UPSTREAM_IMAGE = "quay.io/ocs-dev/ocs-must-gather"
MUST_GATHER_UPSTREAM_TAG = "latest"
MUST_GATHER_TIMEOUT = "3600"

# CrushDeviceClass
CRUSH_DEVICE_CLASS = "ssd"
Expand Down
Loading

0 comments on commit a272f62

Please sign in to comment.