From bcadc9ecd1914da860fb9e6c0a0716e4b216566e Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 23 Jun 2024 23:09:12 -0400 Subject: [PATCH 01/14] Run 8 and 16 node performance tests in parallel --- .../tests/performance_tests/test_openfoam.py | 44 ++++++++++++----- .../tests/performance_tests/test_starccm.py | 47 ++++++++++++++----- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 9689941300..d33d98fdab 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -1,4 +1,5 @@ import logging +from concurrent.futures.thread import ThreadPoolExecutor import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor @@ -33,6 +34,23 @@ def openfoam_installed(headnode): return False +def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): + subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" + logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") + remote_command_executor.run_remote_command( + f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', + additional_files=[str(test_datadir / "openfoam.slurm.sh")], + timeout=OPENFOAM_JOB_TIMEOUT, + ) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "openfoam.results.sh")), hide=False + ) + output = perf_test_result.stdout.strip() + observed_value = int(output.split("\n")[-1].strip()) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + @pytest.mark.parametrize( "number_of_nodes", [[8, 16, 32]], @@ -59,19 +77,19 @@ def test_openfoam( ) logging.info("OpenFOAM Installed") performance_degradation = {} - subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" - for node in number_of_nodes: - logging.info(f"Submitting OpenFOAM job with {node} nodes") - remote_command_executor.run_remote_command( - f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{node}" 2>&1', - additional_files=[str(test_datadir / "openfoam.slurm.sh")], - timeout=OPENFOAM_JOB_TIMEOUT, - ) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "openfoam.results.sh")), hide=False - ) - output = perf_test_result.stdout.strip() - observed_value = int(output.split("\n")[-1].strip()) + + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) + future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") percentage_difference = perf_test_difference(observed_value, baseline_value) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 83dd9f5d9f..f43ce99fb6 100644 --- 
a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -1,5 +1,6 @@ import json import logging +from concurrent.futures import ThreadPoolExecutor import boto3 import pytest @@ -47,6 +48,24 @@ def starccm_installed(headnode): return False +def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath): + num_of_tasks = number_of_nodes * TASK_VCPUS + result = remote_command_executor.run_remote_command( + f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', + additional_files=[str(test_datadir / "starccm.slurm.sh")], + ) + logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes") + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) + scheduler_commands.assert_job_succeeded(job_id) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False + ) + observed_value = float(perf_test_result.stdout) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + @pytest.mark.parametrize( "number_of_nodes", [[8, 16, 32]], @@ -88,21 +107,23 @@ def test_starccm( logging.info("StarCCM+ Installed") podkey, licpath = get_starccm_secrets(region) performance_degradation = {} - for node in number_of_nodes: - num_of_tasks = node * TASK_VCPUS - result = remote_command_executor.run_remote_command( - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', - additional_files=[str(test_datadir / "starccm.slurm.sh")], + + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 8, podkey, licpath ) - logging.info(f"Submitting StarCCM+ job with {node} nodes") - job_id = scheduler_commands.assert_job_submitted(result.stdout) - scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) - scheduler_commands.assert_job_succeeded(job_id) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False + future_16 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 16, podkey, licpath ) - observed_value = float(perf_test_result.stdout) - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, 32, podkey, licpath) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] percentage_difference = perf_test_difference(observed_value, baseline_value) if percentage_difference < 0: From f24e0736c894c35dbc541865d19581ea0708ce76 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 24 Jun 2024 09:38:03 -0400 Subject: [PATCH 02/14] Delete a duplicate logging info --- tests/integration-tests/tests/performance_tests/test_openfoam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py 
b/tests/integration-tests/tests/performance_tests/test_openfoam.py index d33d98fdab..d22e6371de 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -91,7 +91,6 @@ def test_openfoam( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") percentage_difference = perf_test_difference(observed_value, baseline_value) if percentage_difference < 0: outcome = "improvement" From b76afb9ca98d9c6f06b9d9dc10747ee1a6fc1603 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 24 Jun 2024 12:40:05 -0400 Subject: [PATCH 03/14] Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel --- .../tests/performance_tests/test_openfoam.py | 3 ++- .../tests/performance_tests/test_starccm.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index d22e6371de..15e284db7b 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -39,7 +39,6 @@ def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") remote_command_executor.run_remote_command( f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', - additional_files=[str(test_datadir / "openfoam.slurm.sh")], timeout=OPENFOAM_JOB_TIMEOUT, ) perf_test_result = remote_command_executor.run_remote_script( (str(test_datadir / "openfoam.results.sh")), hide=False ) @@ -78,6 +77,8 @@ def test_openfoam( logging.info("OpenFOAM Installed") performance_degradation = {} + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) # Run 8 and 16 node tests in parallel with ThreadPoolExecutor(max_workers=2) as executor: future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index f43ce99fb6..a98014a265 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -51,8 +51,7 @@ def starccm_installed(headnode): def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath): num_of_tasks = number_of_nodes * TASK_VCPUS result = remote_command_executor.run_remote_command( - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', - additional_files=[str(test_datadir / "starccm.slurm.sh")], + f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"' ) logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes") job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) scheduler_commands.assert_job_succeeded(job_id) perf_test_result = remote_command_executor.run_remote_script( (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False @@ -108,6 +107,8 @@ def test_starccm( podkey, licpath = get_starccm_secrets(region) performance_degradation = {} + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + 
remote_command_executor._copy_additional_files([str(test_datadir / "starccm.slurm.sh")]) # Run 8 and 16 node tests in parallel with ThreadPoolExecutor(max_workers=2) as executor: future_8 = executor.submit( From 66c628b97c6809776b558935507beb5e48005d23 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 25 Jun 2024 22:23:59 -0400 Subject: [PATCH 04/14] Group test_starccm and openfoam, and improve logging message output - Move duplicated code to a common file - Modify two tests' config files to make sure they are the same and can be used by both tests - Add dependencies.install.sh in openfoam testdir as well to avoid failures - Create a new shared_performance_test_cluster fixture to group the two tests in the same stack - Now, if openfoam and starccm are not installed, it will not log errors - Now, when percentage_difference is 0, it will log a matching baseline message --- .../tests/performance_tests/common.py | 27 ++++++++++++ .../tests/performance_tests/conftest.py | 41 ++++++++++++++++++ .../tests/performance_tests/test_openfoam.py | 34 ++++----------- .../test_openfoam/dependencies.install.sh | 8 ++++ .../test_openfoam/pcluster.config.yaml | 22 +++++++++- .../tests/performance_tests/test_starccm.py | 42 +++---------------- .../test_starccm/pcluster.config.yaml | 7 ++++ 7 files changed, 117 insertions(+), 64 deletions(-) create mode 100644 tests/integration-tests/tests/performance_tests/conftest.py create mode 100644 tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/common.py b/tests/integration-tests/tests/performance_tests/common.py index 9e49f9dd85..ac7384875f 100644 --- a/tests/integration-tests/tests/performance_tests/common.py +++ b/tests/integration-tests/tests/performance_tests/common.py @@ -27,6 +27,7 @@ PYTEST_PARAMETERIZE_VALUES = [(NUM_COMPUTE_NODES, 1)] TEST_RUNNER_SCRIPT = "/shared/assets/workloads/scale-test/run-scale-test.sh" ROUND_UP_FACTOR = 100_000_000 +PERF_TEST_DIFFERENCE_TOLERANCE = 3 METRICS = [ dict(name="jobRunTime", unit="ms"), @@ -222,3 +223,29 @@ def write_results_to_output_dir( paths["baseline"]["statistics.json"], paths[candidate_configuration]["statistics.json"], ) + + +def perf_test_difference(observed_value, baseline_value): + percentage_difference = 100 * (observed_value - baseline_value) / baseline_value + return percentage_difference + + +def _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value): + percentage_difference = perf_test_difference(observed_value, baseline_value) + if percentage_difference < 0: + outcome = "improvement" + elif percentage_difference == 0: + outcome = "matching baseline" + elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: + outcome = "degradation (within tolerance)" + else: + outcome = "degradation (above tolerance)" + performance_degradation[node] = { + "baseline": baseline_value, + "observed": observed_value, + "percentage_difference": percentage_difference, + } + logging.info( + f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " + f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" + ) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py new file mode 100644 index 0000000000..fd2705a98d --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -0,0 +1,41 @@ +# Copyright 2024 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging + +import boto3 +import pytest + +OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] +NUMBER_OF_NODES = [8, 16, 32] + + +@pytest.fixture(scope="session") +def shared_performance_test_cluster( + vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory +): + + def _shared_performance_test_cluster(instance, os, region, scheduler): + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + number_of_nodes=max(NUMBER_OF_NODES), + ) + cluster = clusters_factory(cluster_config) + logging.info("Cluster Created") + return cluster + + return _shared_performance_test_cluster diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 15e284db7b..0538db7f2b 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -4,6 +4,8 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.performance_tests.common import _log_output_performance_difference + # timeout in seconds OPENFOAM_INSTALLATION_TIMEOUT = 300 OPENFOAM_JOB_TIMEOUT = 5400 # Takes long time because during the first time, it's not only execute the job but also @@ -17,20 +19,15 @@ "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported } -PERF_TEST_DIFFERENCE_TOLERANCE = 3 - - -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference def openfoam_installed(headnode): cmd = '[ -d "/shared/SubspaceBenchmarks" ]' try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("OpenFOAM is not installed on the head node.") return False @@ -61,13 +58,11 @@ def test_openfoam( region, scheduler, pcluster_config_reader, - clusters_factory, + shared_performance_test_cluster, number_of_nodes, test_datadir, ): - cluster_config = pcluster_config_reader(number_of_nodes=max(number_of_nodes)) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") + cluster = shared_performance_test_cluster(instance, os, region, scheduler) remote_command_executor = RemoteCommandExecutor(cluster) if not openfoam_installed(remote_command_executor): logging.info("Installing OpenFOAM") @@ -92,22 +87,7 @@ def test_openfoam( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): 
baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) if performance_degradation: pytest.fail(f"Performance degradation detected: {performance_degradation}") diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh new file mode 100644 index 0000000000..e109f8583d --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# This script installs the necessary software stack for StarCCM+. +# Note: The same cluster is shared by both test_openfoam and test_starccm. +# The cluster will be created by whichever test (test_openfoam or test_starccm) is executed first. +# If test_openfoam is executed first, it will also need to install the required dependencies. +set -ex + +sudo yum install -y libnsl diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml index bf0ea9a2e5..0fce058376 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml @@ -16,12 +16,23 @@ HeadNode: - BucketName: performance-tests-resources-for-parallelcluster KeyName: openfoam/* EnableWriteAccess: false + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: starccm/* + EnableWriteAccess: false +{% if install_extra_deps %} + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} Scheduling: Scheduler: slurm SlurmQueues: - Name: q1 ComputeResources: - - Name: c5n18xl-efa + - Name: c5n-18xl-efa InstanceType: {{ instance }} MinCount: {{ number_of_nodes }} MaxCount: {{ number_of_nodes }} @@ -37,6 +48,15 @@ Scheduling: Iam: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore # Required to report patching status +{% if install_extra_deps %} + S3Access: + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} SharedStorage: - MountDir: /shared Name: shared-fsx diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index a98014a265..53905d8336 100644 --- 
a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -7,6 +7,7 @@ from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds STARCCM_INSTALLATION_TIMEOUT = 1800 @@ -22,7 +23,6 @@ "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 "rocky8": {8: 66.859, 16: 36.184, 32: 21.090}, # v3.8.0 } -PERF_TEST_DIFFERENCE_TOLERANCE = 3 OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] @@ -34,17 +34,13 @@ def get_starccm_secrets(region_name): return secrets["podkey"], secrets["licpath"] -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference - - def starccm_installed(headnode): cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("STAR-CCM+ is not installed on the head node.") return False @@ -76,24 +72,13 @@ def test_starccm( region, scheduler, pcluster_config_reader, - clusters_factory, + shared_performance_test_cluster, number_of_nodes, test_datadir, scheduler_commands_factory, s3_bucket_factory, ): - # Create S3 bucket for custom actions scripts - bucket_name = s3_bucket_factory() - s3 = boto3.client("s3") - s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - - cluster_config = pcluster_config_reader( - bucket_name=bucket_name, - install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, - number_of_nodes=max(number_of_nodes), - ) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") + cluster = shared_performance_test_cluster(instance, os, region, scheduler) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) @@ -126,22 +111,7 @@ def test_starccm( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml 
b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml index 3c456e7a13..6a317a9767 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml @@ -1,6 +1,8 @@ Region: {{ region }} Image: Os: {{ os }} +Imds: + ImdsSupport: v2.0 HeadNode: InstanceType: {{ instance }} Networking: @@ -11,6 +13,9 @@ HeadNode: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore #Required to report patching status S3Access: + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: openfoam/* + EnableWriteAccess: false - BucketName: performance-tests-resources-for-parallelcluster KeyName: starccm/* EnableWriteAccess: false @@ -59,5 +64,7 @@ SharedStorage: FsxLustreSettings: StorageCapacity: 2400 DeploymentType: PERSISTENT_1 + AutomaticBackupRetentionDays: 30 + DailyAutomaticBackupStartTime: 00:00 PerUnitStorageThroughput: 100 StorageType: SSD From 8518d79deff15ed9657f43c26c21e331564ad17c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 25 Jun 2024 22:50:09 -0400 Subject: [PATCH 05/14] Change the shared_performance_test_cluster scope from session to class --- tests/integration-tests/tests/performance_tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index fd2705a98d..1cb8f7b114 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -19,7 +19,7 @@ NUMBER_OF_NODES = [8, 16, 32] -@pytest.fixture(scope="session") +@pytest.fixture(scope="class") def shared_performance_test_cluster( vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory ): From 84ba647b5d154b261d25507b5fe38a921cefa12c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 26 Jun 2024 10:06:14 -0400 Subject: [PATCH 06/14] Add shared fixture to test --- .../tests/performance_tests/conftest.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index 1cb8f7b114..b5fc7fa689 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -11,9 +11,15 @@ # See the License for the specific language governing permissions and limitations under the License. import logging +import os import boto3 import pytest +from jinja2 import FileSystemLoader +from jinja2.sandbox import SandboxedEnvironment + +from conftest import _get_default_template_values, inject_additional_config_settings, \ + inject_additional_image_configs_settings OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] NUMBER_OF_NODES = [8, 16, 32] @@ -39,3 +45,55 @@ def _shared_performance_test_cluster(instance, os, region, scheduler): return cluster return _shared_performance_test_cluster + + +@pytest.fixture(scope="class") +def shared_test_datadir(request, datadir): + """ + Inject the datadir with resources for the specific test function. + + If the test function is declared in a class then datadir is ClassName/FunctionName + otherwise it is only FunctionName. 
+ """ + function_name = request.function.__name__ + if not request.cls: + return datadir / function_name + + class_name = request.cls.__name__ + return datadir / "{0}/{1}".format(class_name, function_name) + + +@pytest.fixture(scope="class") +def shared_pcluster_config_reader(test_datadir, vpc_stack, request, region): + """ + Define a fixture to render pcluster config templates associated to the running test. + + The config for a given test is a pcluster.config.yaml file stored in the configs_datadir folder. + The config can be written by using Jinja2 template engine. + The current renderer already replaces placeholders for current keys: + {{ region }}, {{ os }}, {{ instance }}, {{ scheduler}}, {{ key_name }}, + {{ vpc_id }}, {{ public_subnet_id }}, {{ private_subnet_id }}, {{ default_vpc_security_group_id }} + The current renderer injects options for custom templates and packages in case these + are passed to the cli and not present already in the cluster config. + Also sanity_check is set to true by default unless explicitly set in config. + + :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template + """ + + def _config_renderer(config_file="pcluster.config.yaml", benchmarks=None, output_file=None, **kwargs): + config_file_path = test_datadir / config_file + if not os.path.isfile(config_file_path): + raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") + output_file_path = test_datadir / output_file if output_file else config_file_path + default_values = _get_default_template_values(vpc_stack, request) + file_loader = FileSystemLoader(str(test_datadir)) + env = SandboxedEnvironment(loader=file_loader) + rendered_template = env.get_template(config_file).render(**{**default_values, **kwargs}) + output_file_path.write_text(rendered_template) + if not config_file.endswith("image.config.yaml"): + inject_additional_config_settings(output_file_path, request, region, benchmarks) + else: + inject_additional_image_configs_settings(output_file_path, request) + return output_file_path + + return _config_renderer From e3210d63d17cc4e48a0ee87b48e7452362afbe97 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 26 Jun 2024 16:28:32 -0400 Subject: [PATCH 07/14] Use shared fixture in shared cluster fixture for now to test, will modify in the future --- tests/integration-tests/tests/performance_tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index b5fc7fa689..2176ee91a2 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -27,15 +27,15 @@ @pytest.fixture(scope="class") def shared_performance_test_cluster( - vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory + vpc_stack, shared_pcluster_config_reader, clusters_factory, shared_test_datadir, s3_bucket_factory ): def _shared_performance_test_cluster(instance, os, region, scheduler): bucket_name = s3_bucket_factory() s3 = boto3.client("s3") - s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + s3.upload_file(str(shared_test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - cluster_config = pcluster_config_reader( + cluster_config = shared_pcluster_config_reader( 
bucket_name=bucket_name, install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(NUMBER_OF_NODES), From c5d59056459cd474c014ce9ce854342fe2c93637 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 30 Jun 2024 21:23:09 -0400 Subject: [PATCH 08/14] Restructure. Group two tests in a module, restructure files to adopt changes. --- tests/integration-tests/conftest.py | 81 +++++++++++- .../tests/performance_tests/conftest.py | 99 -------------- .../tests/performance_tests/test_openfoam.py | 95 -------------- ...tarccm.py => test_starccm_and_openfoam.py} | 122 +++++++++++++++++- .../test_openfoam/dependencies.install.sh | 0 .../test_openfoam/openfoam.install.sh | 0 .../test_openfoam/openfoam.results.sh | 0 .../test_openfoam/openfoam.slurm.sh | 0 .../test_openfoam/pcluster.config.yaml | 0 .../test_starccm/dependencies.install.sh | 0 .../test_starccm/pcluster.config.yaml | 0 .../test_starccm/starccm.install.sh | 0 .../test_starccm/starccm.results.sh | 0 .../test_starccm/starccm.slurm.sh | 0 14 files changed, 197 insertions(+), 200 deletions(-) delete mode 100644 tests/integration-tests/tests/performance_tests/conftest.py delete mode 100644 tests/integration-tests/tests/performance_tests/test_openfoam.py rename tests/integration-tests/tests/performance_tests/{test_starccm.py => test_starccm_and_openfoam.py} (50%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/dependencies.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.results.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.slurm.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/pcluster.config.yaml (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/dependencies.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/pcluster.config.yaml (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.results.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.slurm.sh (100%) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a25cfc5f59..c2ad5151e5 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -21,6 +21,7 @@ from functools import partial from itertools import product from shutil import copyfile +from time import sleep from traceback import format_tb from typing import Any, Dict, List, Optional, Union @@ -366,6 +367,70 @@ def _setup_custom_logger(log_file): logger.addHandler(file_handler) +class SharedClusterDetectionTimeoutError(Exception): + """Custom exception for shared cluster detection timeout.""" + + pass + + +@pytest.fixture(scope="module") +@pytest.mark.usefixtures("setup_credentials") +def shared_clusters_factory(request): + """ + Define a fixture to manage the creation and 
destruction of module shared clusters. + + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success")) + + if not hasattr(request.module, "shared_existing_cluster"): + request.module.shared_existing_cluster = None + request.module.is_cluster_started_to_create = False + + def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs): + if request.module.is_cluster_started_to_create: + for retry in range(40): + if request.module.shared_existing_cluster: + logging.info(f"Shared cluster {request.module.shared_existing_cluster.name} detected.") + return request.module.shared_existing_cluster + else: + logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)") + sleep(60) + raise SharedClusterDetectionTimeoutError( + "Timeout: Failed to detect the shared cluster within the allowed retries." + ) + + request.module.is_cluster_started_to_create = True + cluster_config = _write_config_to_outdir(request, cluster_config, "clusters_configs") + cluster = Cluster( + name=( + request.config.getoption("cluster") + if request.config.getoption("cluster") + else "integ-tests-{0}{1}{2}".format( + random_alphanumeric().upper() if upper_case_cluster_name else random_alphanumeric(), + "-" if request.config.getoption("stackname_suffix") else "", + request.config.getoption("stackname_suffix"), + ) + ), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + region=region, + custom_cli_credentials=custom_cli_credentials, + ) + if not request.config.getoption("cluster"): + cluster.creation_response = factory.create_cluster(cluster, **kwargs) + request.module.shared_existing_cluster = cluster + return cluster + + yield _cluster_factory + if not request.config.getoption("no_delete"): + try: + test_passed = request.node.rep_call.passed + except AttributeError: + test_passed = False + factory.destroy_all_clusters(test_passed=test_passed) + + @pytest.fixture(scope="class") @pytest.mark.usefixtures("setup_credentials") def clusters_factory(request, region): @@ -509,9 +574,21 @@ def _write_config_to_outdir(request, config, config_dir): out_dir = request.config.getoption("output_dir") # Sanitize config file name to make it Windows compatible - # request.node.nodeid example: + # class scope request.node.nodeid example: # 'dcv/test_dcv.py::test_dcv_configuration[eu-west-1-c5.xlarge-centos7-slurm-8443-0.0.0.0/0-/shared]' - test_file, test_name = request.node.nodeid.split("::", 1) + # module scope request.node.nodeid example: + # 'performance_tests/test_starccm_and_openfoam.py' + # TODO: Find a better way to name module_scope_test + logging.info(f"request.node.nodeid: {request.node.nodeid}") + nodeid_parts = request.node.nodeid.split("::") + if len(nodeid_parts) == 2: + test_file, test_name = nodeid_parts + elif len(nodeid_parts) == 1: + test_file = nodeid_parts[0] + test_name = "module_scope_test" + else: + raise ValueError(f"Unexpected nodeid format: {request.node.nodeid}") + config_file_name = "{0}-{1}".format(test_file, test_name.replace("/", "_")) os.makedirs( diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py deleted file mode 100644 index 2176ee91a2..0000000000 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2024 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -import logging -import os - -import boto3 -import pytest -from jinja2 import FileSystemLoader -from jinja2.sandbox import SandboxedEnvironment - -from conftest import _get_default_template_values, inject_additional_config_settings, \ - inject_additional_image_configs_settings - -OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] -NUMBER_OF_NODES = [8, 16, 32] - - -@pytest.fixture(scope="class") -def shared_performance_test_cluster( - vpc_stack, shared_pcluster_config_reader, clusters_factory, shared_test_datadir, s3_bucket_factory -): - - def _shared_performance_test_cluster(instance, os, region, scheduler): - bucket_name = s3_bucket_factory() - s3 = boto3.client("s3") - s3.upload_file(str(shared_test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - - cluster_config = shared_pcluster_config_reader( - bucket_name=bucket_name, - install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, - number_of_nodes=max(NUMBER_OF_NODES), - ) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") - return cluster - - return _shared_performance_test_cluster - - -@pytest.fixture(scope="class") -def shared_test_datadir(request, datadir): - """ - Inject the datadir with resources for the specific test function. - - If the test function is declared in a class then datadir is ClassName/FunctionName - otherwise it is only FunctionName. - """ - function_name = request.function.__name__ - if not request.cls: - return datadir / function_name - - class_name = request.cls.__name__ - return datadir / "{0}/{1}".format(class_name, function_name) - - -@pytest.fixture(scope="class") -def shared_pcluster_config_reader(test_datadir, vpc_stack, request, region): - """ - Define a fixture to render pcluster config templates associated to the running test. - - The config for a given test is a pcluster.config.yaml file stored in the configs_datadir folder. - The config can be written by using Jinja2 template engine. - The current renderer already replaces placeholders for current keys: - {{ region }}, {{ os }}, {{ instance }}, {{ scheduler}}, {{ key_name }}, - {{ vpc_id }}, {{ public_subnet_id }}, {{ private_subnet_id }}, {{ default_vpc_security_group_id }} - The current renderer injects options for custom templates and packages in case these - are passed to the cli and not present already in the cluster config. - Also sanity_check is set to true by default unless explicitly set in config. 
- - :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template - """ - - def _config_renderer(config_file="pcluster.config.yaml", benchmarks=None, output_file=None, **kwargs): - config_file_path = test_datadir / config_file - if not os.path.isfile(config_file_path): - raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") - output_file_path = test_datadir / output_file if output_file else config_file_path - default_values = _get_default_template_values(vpc_stack, request) - file_loader = FileSystemLoader(str(test_datadir)) - env = SandboxedEnvironment(loader=file_loader) - rendered_template = env.get_template(config_file).render(**{**default_values, **kwargs}) - output_file_path.write_text(rendered_template) - if not config_file.endswith("image.config.yaml"): - inject_additional_config_settings(output_file_path, request, region, benchmarks) - else: - inject_additional_image_configs_settings(output_file_path, request) - return output_file_path - - return _config_renderer diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py deleted file mode 100644 index 0538db7f2b..0000000000 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ /dev/null @@ -1,95 +0,0 @@ -import logging -from concurrent.futures.thread import ThreadPoolExecutor - -import pytest -from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor - -from tests.performance_tests.common import _log_output_performance_difference - -# timeout in seconds -OPENFOAM_INSTALLATION_TIMEOUT = 300 -OPENFOAM_JOB_TIMEOUT = 5400 # Takes long time because during the first time, it's not only execute the job but also -# builds and installs many things -TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { - "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 - "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported - "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 - "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 - "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported - "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported -} - - -def openfoam_installed(headnode): - cmd = '[ -d "/shared/SubspaceBenchmarks" ]' - try: - headnode.run_remote_command(cmd, log_error=False) - return True - except RemoteCommandExecutionError: - logging.info("OpenFOAM is not installed on the head node.") - return False - - -def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): - subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" - logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") - remote_command_executor.run_remote_command( - f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', - timeout=OPENFOAM_JOB_TIMEOUT, - ) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "openfoam.results.sh")), hide=False - ) - output = perf_test_result.stdout.strip() - observed_value = int(output.split("\n")[-1].strip()) - logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") - return observed_value - - -@pytest.mark.parametrize( - "number_of_nodes", - [[8, 16, 32]], -) -def test_openfoam( - vpc_stack, - instance, - os, - region, - scheduler, - 
pcluster_config_reader, - shared_performance_test_cluster, - number_of_nodes, - test_datadir, -): - cluster = shared_performance_test_cluster(instance, os, region, scheduler) - remote_command_executor = RemoteCommandExecutor(cluster) - if not openfoam_installed(remote_command_executor): - logging.info("Installing OpenFOAM") - remote_command_executor.run_remote_script( - str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False - ) - logging.info("OpenFOAM Installed") - performance_degradation = {} - - # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel - remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) - # Run 8 and 16 node tests in parallel - with ThreadPoolExecutor(max_workers=2) as executor: - future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) - future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) - observed_value_8 = future_8.result() - observed_value_16 = future_16.result() - - # Run 32 node test - observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) - - # Check results and log performance degradation - for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) - - if performance_degradation: - pytest.fail(f"Performance degradation detected: {performance_degradation}") - else: - logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py similarity index 50% rename from tests/integration-tests/tests/performance_tests/test_starccm.py rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py index 53905d8336..b02f579f4d 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py @@ -13,8 +13,12 @@ STARCCM_INSTALLATION_TIMEOUT = 1800 STARCCM_JOB_TIMEOUT = 600 STARCCM_LICENCE_SECRET = "starccm-license-secret" + +OPENFOAM_INSTALLATION_TIMEOUT = 300 +OPENFOAM_JOB_TIMEOUT = 5400 # Takes a long time because during the first run it not only executes the job but also + TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM = { "alinux2023": {8: 62.414, 16: 31.998, 32: 20.422}, # v3.10.0 "alinux2": {8: 64.475, 16: 33.173, 32: 17.899}, # v3.1.3 "ubuntu2204": {8: 75.502, 16: 36.353, 32: 19.688}, # v3.7.0 @@ -24,6 +28,15 @@ "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 "rocky8": {8: 66.859, 16: 36.184, 32: 21.090}, # v3.8.0 } +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM = { + "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 + "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported + "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 + "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 + "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported + "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported +} + OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] @@ -34,6 +47,32 @@ def 
get_starccm_secrets(region_name): return secrets["podkey"], secrets["licpath"] +def openfoam_installed(headnode): + cmd = '[ -d "/shared/SubspaceBenchmarks" ]' + try: + headnode.run_remote_command(cmd, log_error=False) + return True + except RemoteCommandExecutionError: + logging.info("OpenFOAM is not installed on the head node.") + return False + + +def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): + subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" + logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") + remote_command_executor.run_remote_command( + f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', + timeout=OPENFOAM_JOB_TIMEOUT, + ) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "openfoam.results.sh")), hide=False + ) + output = perf_test_result.stdout.strip() + observed_value = int(output.split("\n")[-1].strip()) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + def starccm_installed(headnode): cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("STAR-CCM+ is not installed on the head node.") return False @@ -72,13 +111,27 @@ def test_starccm( region, scheduler, pcluster_config_reader, - shared_performance_test_cluster, + shared_clusters_factory, number_of_nodes, test_datadir, scheduler_commands_factory, s3_bucket_factory, ): - cluster = shared_performance_test_cluster(instance, os, region, scheduler) + logging.info("start to create s3") + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + number_of_nodes=max(number_of_nodes), + ) + test_region = region + logging.info(f"test region is {test_region}") + cluster = shared_clusters_factory(cluster_config, test_region) + logging.info("Cluster Created") + remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) @@ -110,7 +163,7 @@ def test_starccm( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM[os][node] _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) @@ -119,3 +172,64 @@ def test_starccm( pytest.fail(f"Performance degradation detected: {performance_degradation}") else: logging.info("Performance test results show no performance degradation") + + +@pytest.mark.parametrize( + "number_of_nodes", + [[8, 16, 32]], +) +def test_openfoam( + vpc_stack, + instance, + os, + region, + scheduler, + pcluster_config_reader, + shared_clusters_factory, + number_of_nodes, + test_datadir, + s3_bucket_factory, +): + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + 
number_of_nodes=max(number_of_nodes), + ) + test_region = region + logging.info(f"test region is {test_region}") + cluster = shared_clusters_factory(cluster_config, test_region) + logging.info("Cluster Created") + remote_command_executor = RemoteCommandExecutor(cluster) + if not openfoam_installed(remote_command_executor): + logging.info("Installing OpenFOAM") + remote_command_executor.run_remote_script( + str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False + ) + logging.info("OpenFOAM Installed") + performance_degradation = {} + + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) + future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM[os][node] + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) + + if performance_degradation: + pytest.fail(f"Performance degradation detected: {performance_degradation}") + else: + logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh rename to 
tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh From ab38a3522188e75e55d8056df3fa69283592d0bc Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 1 Jul 2024 16:05:45 -0400 Subject: [PATCH 09/14] Modify hasattr logic, add logging info to debug --- tests/integration-tests/conftest.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index c2ad5151e5..fe15d4cbea 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -373,6 +373,12 @@ class SharedClusterDetectionTimeoutError(Exception): pass +class SharedClusterDetectionTimeoutError(Exception): + """Custom exception for 
From 86345309fc2344da89372d9bdc667d7cac85bc23 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 12:49:16 -0400
Subject: [PATCH 10/14] Delete duplicate exception class

---
 tests/integration-tests/conftest.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index fe15d4cbea..70e0154fb2 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -369,13 +369,6 @@ def _setup_custom_logger(log_file):

 class SharedClusterDetectionTimeoutError(Exception):
     """Custom exception for shared cluster detection timeout."""
-
-    pass
-
-
-class SharedClusterDetectionTimeoutError(Exception):
-    """Custom exception for shared cluster detection timeout."""
-
     pass

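
The duplicate removed here is more than cosmetic: a second `class` statement with the same name silently rebinds it, so any code still holding the first class object no longer matches `except` clauses written against the name. A minimal, self-contained illustration (unrelated to the patched file):

    class Err(Exception):
        """First definition."""

    FirstErr = Err  # keep a reference to the first class object

    class Err(Exception):  # noqa: F811 -- redefinition silently rebinds the name
        """Second, duplicate definition."""

    try:
        raise FirstErr()
    except Err:
        print("caught")  # not reached: Err now names the second class
    except Exception:
        print("missed: the two definitions are distinct classes")  # this prints
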
From 04132f315f83e3ff24f2bf925eb0887610f473fa Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 18:44:16 -0400
Subject: [PATCH 11/14] Use xdist_session_fixture, use hashmap and set to adopt changes

---
 tests/integration-tests/conftest.py | 38 ++++++++++---------
 .../test_starccm_and_openfoam.py | 8 +---
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index 70e0154fb2..ff8fe5c955 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -372,32 +372,33 @@ class SharedClusterDetectionTimeoutError(Exception):
     pass


-@pytest.fixture(scope="module")
+@xdist_session_fixture(autouse=True)
 @pytest.mark.usefixtures("setup_credentials")
 def shared_clusters_factory(request):
     """
-    Define a fixture to manage the creation and destruction of module shared clusters.
+    Define a fixture to manage the creation and destruction of session shared clusters.

     The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config
     """
     factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success"))

-    if not hasattr(request.module, "is_cluster_started_to_create"):
-        logging.info("Setting is_cluster_started_to_create and shared_existing_cluster")
-        request.module.is_cluster_started_to_create = False
-        request.module.shared_existing_cluster = None
+    if not hasattr(request.session, "shared_existing_clusters"):
+        logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters")
+        request.session.shared_existing_clusters = {}
+        request.session.shared_existing_clusters_started_to_create = set()

-    def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+    def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+        cluster_key = f"{region}-{instance}-{os}-{scheduler}"
         logging.info(
-            "Shared cluster already started to create"
-            if request.module.is_cluster_started_to_create
-            else "Start to create shared cluster"
+            "Eligible for using shared cluster, start to detect."
+            if cluster_key in request.session.shared_existing_clusters_started_to_create
+            else "Start to create shared cluster for specific region, instance type, os and scheduler"
         )
-        if request.module.is_cluster_started_to_create:
+        if cluster_key in request.session.shared_existing_clusters_started_to_create:
             for retry in range(40):
-                if request.module.shared_existing_cluster:
-                    logging.info(f"Shared cluster {request.module.shared_existing_cluster.name} detected.")
-                    return request.module.shared_existing_cluster
+                if cluster_key in request.session.shared_existing_clusters:
+                    logging.info(f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected.")
+                    return request.session.shared_existing_clusters[cluster_key]
                 else:
                     logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)")
                     sleep(60)
@@ -405,7 +406,7 @@ def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, cust
                 "Timeout: Failed to detect the shared cluster within the allowed retries."
             )
-        request.module.is_cluster_started_to_create = True
+        request.session.shared_existing_clusters_started_to_create.add(cluster_key)
         cluster_config = _write_config_to_outdir(request, cluster_config, "clusters_configs")
         cluster = Cluster(
             name=(
@@ -424,10 +425,11 @@ def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, cust
         )
         if not request.config.getoption("cluster"):
             cluster.creation_response = factory.create_cluster(cluster, **kwargs)
-            request.module.shared_existing_cluster = cluster
+            request.session.shared_existing_clusters[cluster_key] = cluster
         return cluster

     yield _cluster_factory
+
     if not request.config.getoption("no_delete"):
         try:
             test_passed = request.node.rep_call.passed
@@ -583,14 +585,14 @@ def _write_config_to_outdir(request, config, config_dir):
     # 'dcv/test_dcv.py::test_dcv_configuration[eu-west-1-c5.xlarge-centos7-slurm-8443-0.0.0.0/0-/shared]'
     # module scope request.node.nodeid example:
     # 'performance_tests/test_starccm_and_openfoam.py'
-    # TODO: Find a better way to name module_scope_test
+    # TODO: Find a better way to name module_scope_test/session_scope_test
     logging.info(f"request.node.nodeid: {request.node.nodeid}")
     nodeid_parts = request.node.nodeid.split("::")
     if len(nodeid_parts) == 2:
         test_file, test_name = nodeid_parts
     elif len(nodeid_parts) == 1:
         test_file = nodeid_parts[0]
-        test_name = "module_scope_test"
+        test_name = "module_scope_test" + random_alphanumeric()
     else:
         raise ValueError(f"Unexpected nodeid format: {request.node.nodeid}")
diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
index b02f579f4d..fb76ecd234 100644
--- a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
+++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
@@ -127,9 +127,7 @@ def test_starccm(
         install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS,
         number_of_nodes=max(number_of_nodes),
     )
-    test_region = region
-    logging.info(f"test region is {test_region}")
-    cluster = shared_clusters_factory(cluster_config, test_region)
+    cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler)
     logging.info("Cluster Created")

     remote_command_executor = RemoteCommandExecutor(cluster)
@@ -199,9 +197,7 @@ def test_openfoam(
         install_extra_deps=os in number_of_nodes,
         number_of_nodes=max(number_of_nodes),
     )
-    test_region = region
-    logging.info(f"test region is {test_region}")
-    cluster = shared_clusters_factory(cluster_config, test_region)
+    cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler)
     logging.info("Cluster Created")
     remote_command_executor = RemoteCommandExecutor(cluster)
     if not openfoam_installed(remote_command_executor):
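
PATCH 11 generalizes the single boolean to per-configuration bookkeeping: a set records which `{region}-{instance}-{os}-{scheduler}` keys have started creating, and a dict records the finished clusters. A minimal sketch of that bookkeeping, extending the earlier one and again with illustrative names:

    import logging
    import time

    created = {}  # cluster_key -> finished resource
    started = set()  # cluster_keys whose creation has begun

    def get_or_create(region, instance, os, scheduler, create_fn, retries=40, delay=60):
        """Create one shared resource per (region, instance, os, scheduler) combination."""
        cluster_key = f"{region}-{instance}-{os}-{scheduler}"
        if cluster_key in started:
            for retry in range(retries):
                if cluster_key in created:
                    return created[cluster_key]
                logging.info(f"{cluster_key} not ready yet. Retrying... ({retry + 1}/{retries})")
                time.sleep(delay)
            raise TimeoutError(f"Timed out waiting for {cluster_key}")
        started.add(cluster_key)
        created[cluster_key] = create_fn()
        return created[cluster_key]

Tests that share all four dimensions reuse one cluster; a difference in any dimension yields a new key and a fresh cluster.
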
From 9c0dc7d6c5f02dbf86e7897994f08fa60bad2639 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 19:43:21 -0400
Subject: [PATCH 12/14] Use _cluster_factory_wrapper to try to avoid AttributeError: Can't pickle local object

---
 tests/integration-tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index ff8fe5c955..cfbe9e5c22 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -387,6 +387,10 @@ def shared_clusters_factory(request):
         request.session.shared_existing_clusters = {}
         request.session.shared_existing_clusters_started_to_create = set()

+    yield from _cluster_factory_wrapper(request, factory)
+
+
+def _cluster_factory_wrapper(request, factory):
     def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
         cluster_key = f"{region}-{instance}-{os}-{scheduler}"
         logging.info(
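
The error named in this and the next two subjects stems from how pickle serializes functions: by importable qualified name. A function defined inside another function has a `<locals>` qualname that cannot be looked up on import, so it cannot be pickled no matter where its enclosing wrapper lives — which is why this wrapper alone does not resolve the issue. A standalone demonstration:

    import pickle

    def make_factory():
        def _factory():  # qualname is 'make_factory.<locals>._factory'
            return 42
        return _factory

    try:
        pickle.dumps(make_factory())
    except (AttributeError, pickle.PicklingError) as exc:
        # e.g. "Can't pickle local object 'make_factory.<locals>._factory'"
        print(exc)
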
From 8ee614b5a96034ada9ac384775a9714fe9ac04e2 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 20:57:48 -0400
Subject: [PATCH 13/14] Use Encapsulation Classes to try to avoid AttributeError: Can't pickle local object

---
 tests/integration-tests/conftest.py | 62 +++++++++++++++++++----------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index cfbe9e5c22..d619bb095c 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -369,30 +369,32 @@ def _setup_custom_logger(log_file):
 class SharedClusterDetectionTimeoutError(Exception):
     """Custom exception for shared cluster detection timeout."""

-    pass
-
-
-@xdist_session_fixture(autouse=True)
-@pytest.mark.usefixtures("setup_credentials")
-def shared_clusters_factory(request):
-    """
-    Define a fixture to manage the creation and destruction of session shared clusters.
-
-    The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config
-    """
-    factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success"))
+    pass

-    if not hasattr(request.session, "shared_existing_clusters"):
-        logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters")
-        request.session.shared_existing_clusters = {}
-        request.session.shared_existing_clusters_started_to_create = set()
+class ClusterManager:
+    """Cluster Manager for shared cluster fixture to avoid AttributeError: Can't pickle local object"""

-    yield from _cluster_factory_wrapper(request, factory)
+    def __init__(self, request, factory):
+        self.request = request
+        self.factory = factory

-def _cluster_factory_wrapper(request, factory):
-    def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+    def cluster_factory(
+        self,
+        cluster_config,
+        region,
+        instance,
+        os,
+        scheduler,
+        upper_case_cluster_name=False,
+        custom_cli_credentials=None,
+        **kwargs,
+    ):
+        """Create cluster or use existing cluster."""
         cluster_key = f"{region}-{instance}-{os}-{scheduler}"
+        request = self.request
+        factory = self.factory
         logging.info(
             "Eligible for using shared cluster, start to detect."
             if cluster_key in request.session.shared_existing_clusters_started_to_create
@@ -403,7 +403,9 @@
         if cluster_key in request.session.shared_existing_clusters_started_to_create:
             for retry in range(40):
                 if cluster_key in request.session.shared_existing_clusters:
-                    logging.info(f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected.")
+                    logging.info(
+                        f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected."
+                    )
                     return request.session.shared_existing_clusters[cluster_key]
                 else:
                     logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)")
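
A bound method of a module-level class, by contrast, pickles by reference to the class plus the instance's state, so hoisting the factory into `ClusterManager` can sidestep the `<locals>` problem — provided everything the instance carries is itself picklable. A standalone sketch:

    import pickle

    class Manager:
        """Module-level class, so its methods have importable qualnames."""

        def __init__(self, value):
            self.value = value

        def factory(self):
            return self.value

    m = Manager(42)
    restored = pickle.loads(pickle.dumps(m.factory))  # bound method round-trips
    print(restored())  # 42

Pickling `m.factory` also pickles `m` and everything it references, so an instance holding live objects such as a pytest `request` may still fail — which appears to be what the final patch below is working around by yielding the manager itself.
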
({retry + 1}/40)") @@ -432,7 +436,25 @@ def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case request.session.shared_existing_clusters[cluster_key] = cluster return cluster - yield _cluster_factory + +@xdist_session_fixture(autouse=True) +@pytest.mark.usefixtures("setup_credentials") +def shared_clusters_factory(request): + """ + Define a fixture to manage the creation and destruction of session shared clusters. + + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success")) + + if not hasattr(request.session, "shared_existing_clusters"): + logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters") + request.session.shared_existing_clusters = {} + request.session.shared_existing_clusters_started_to_create = set() + + manager = ClusterManager(request, factory) + + yield manager.cluster_factory if not request.config.getoption("no_delete"): try: From a9eb36628189dc37dee15bc362fb0148de62d471 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 2 Jul 2024 21:11:52 -0400 Subject: [PATCH 14/14] yield ClusterManager instead of funtion in it to try to avoid AttributeError: Can't pickle local object --- tests/integration-tests/conftest.py | 2 +- .../tests/performance_tests/test_starccm_and_openfoam.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index d619bb095c..eb2a27df93 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -454,7 +454,7 @@ def shared_clusters_factory(request): manager = ClusterManager(request, factory) - yield manager.cluster_factory + yield manager if not request.config.getoption("no_delete"): try: diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py index fb76ecd234..bc0860082c 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py @@ -127,7 +127,7 @@ def test_starccm( install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(number_of_nodes), ) - cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) logging.info("Cluster Created") remote_command_executor = RemoteCommandExecutor(cluster) @@ -197,7 +197,7 @@ def test_openfoam( install_extra_deps=os in number_of_nodes, number_of_nodes=max(number_of_nodes), ) - cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) logging.info("Cluster Created") remote_command_executor = RemoteCommandExecutor(cluster) if not openfoam_installed(remote_command_executor):