From bcadc9ecd1914da860fb9e6c0a0716e4b216566e Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 23 Jun 2024 23:09:12 -0400 Subject: [PATCH 01/14] Run 8 and 16 node performance tests in parallel --- .../tests/performance_tests/test_openfoam.py | 44 ++++++++++++----- .../tests/performance_tests/test_starccm.py | 47 ++++++++++++++----- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 9689941300..d33d98fdab 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -1,4 +1,5 @@ import logging +from concurrent.futures.thread import ThreadPoolExecutor import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor @@ -33,6 +34,23 @@ def openfoam_installed(headnode): return False +def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): + subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" + logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") + remote_command_executor.run_remote_command( + f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', + additional_files=[str(test_datadir / "openfoam.slurm.sh")], + timeout=OPENFOAM_JOB_TIMEOUT, + ) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "openfoam.results.sh")), hide=False + ) + output = perf_test_result.stdout.strip() + observed_value = int(output.split("\n")[-1].strip()) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + @pytest.mark.parametrize( "number_of_nodes", [[8, 16, 32]], @@ -59,19 +77,19 @@ def test_openfoam( ) logging.info("OpenFOAM Installed") performance_degradation = {} - subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" - for node in number_of_nodes: - logging.info(f"Submitting OpenFOAM job with {node} nodes") - remote_command_executor.run_remote_command( - f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{node}" 2>&1', - additional_files=[str(test_datadir / "openfoam.slurm.sh")], - timeout=OPENFOAM_JOB_TIMEOUT, - ) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "openfoam.results.sh")), hide=False - ) - output = perf_test_result.stdout.strip() - observed_value = int(output.split("\n")[-1].strip()) + + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) + future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") percentage_difference = perf_test_difference(observed_value, baseline_value) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index 83dd9f5d9f..f43ce99fb6 100644 --- 
a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -1,5 +1,6 @@ import json import logging +from concurrent.futures import ThreadPoolExecutor import boto3 import pytest @@ -47,6 +48,24 @@ def starccm_installed(headnode): return False +def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath): + num_of_tasks = number_of_nodes * TASK_VCPUS + result = remote_command_executor.run_remote_command( + f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', + additional_files=[str(test_datadir / "starccm.slurm.sh")], + ) + logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes") + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) + scheduler_commands.assert_job_succeeded(job_id) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False + ) + observed_value = float(perf_test_result.stdout) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + @pytest.mark.parametrize( "number_of_nodes", [[8, 16, 32]], @@ -88,21 +107,23 @@ def test_starccm( logging.info("StarCCM+ Installed") podkey, licpath = get_starccm_secrets(region) performance_degradation = {} - for node in number_of_nodes: - num_of_tasks = node * TASK_VCPUS - result = remote_command_executor.run_remote_command( - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', - additional_files=[str(test_datadir / "starccm.slurm.sh")], + + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 8, podkey, licpath ) - logging.info(f"Submitting StarCCM+ job with {node} nodes") - job_id = scheduler_commands.assert_job_submitted(result.stdout) - scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) - scheduler_commands.assert_job_succeeded(job_id) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False + future_16 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 16, podkey, licpath ) - observed_value = float(perf_test_result.stdout) - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, 32, podkey, licpath) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] percentage_difference = perf_test_difference(observed_value, baseline_value) if percentage_difference < 0: From f24e0736c894c35dbc541865d19581ea0708ce76 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 24 Jun 2024 09:38:03 -0400 Subject: [PATCH 02/14] Delete a duplicate logging info --- tests/integration-tests/tests/performance_tests/test_openfoam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py 
b/tests/integration-tests/tests/performance_tests/test_openfoam.py index d33d98fdab..d22e6371de 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -91,7 +91,6 @@ def test_openfoam( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") percentage_difference = perf_test_difference(observed_value, baseline_value) if percentage_difference < 0: outcome = "improvement" From b76afb9ca98d9c6f06b9d9dc10747ee1a6fc1603 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 24 Jun 2024 12:40:05 -0400 Subject: [PATCH 03/14] Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel --- .../tests/performance_tests/test_openfoam.py | 3 ++- .../tests/performance_tests/test_starccm.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index d22e6371de..15e284db7b 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -39,7 +39,6 @@ def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") remote_command_executor.run_remote_command( f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', - additional_files=[str(test_datadir / "openfoam.slurm.sh")], timeout=OPENFOAM_JOB_TIMEOUT, ) perf_test_result = remote_command_executor.run_remote_script( (str(test_datadir / "openfoam.results.sh")), hide=False ) @@ -78,6 +77,8 @@ def test_openfoam( logging.info("OpenFOAM Installed") performance_degradation = {} + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) # Run 8 and 16 node tests in parallel with ThreadPoolExecutor(max_workers=2) as executor: future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index f43ce99fb6..a98014a265 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -51,8 +51,7 @@ def starccm_installed(headnode): def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath): num_of_tasks = number_of_nodes * TASK_VCPUS result = remote_command_executor.run_remote_command( - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', - additional_files=[str(test_datadir / "starccm.slurm.sh")], + f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"' ) logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes") job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) scheduler_commands.assert_job_succeeded(job_id) perf_test_result = remote_command_executor.run_remote_script( (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False @@ -108,6 +107,8 @@ def test_starccm( podkey, licpath = get_starccm_secrets(region) performance_degradation = {} + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + 
remote_command_executor._copy_additional_files([str(test_datadir / "starccm.slurm.sh")]) # Run 8 and 16 node tests in parallel with ThreadPoolExecutor(max_workers=2) as executor: future_8 = executor.submit( From 66c628b97c6809776b558935507beb5e48005d23 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 25 Jun 2024 22:23:59 -0400 Subject: [PATCH 04/14] Group test_starccm and openfoam, and improve logging message output - Move duplicated code to a common file - Modify two tests' config files to make sure they are the same and can be used by both tests - Add dependencies.install.sh in openfoam testdir as well to avoid failures - Create a new shared_performance_test_cluster fixture to group the two tests in the same stack - Now, if openfoam and starccm are not installed, it will not log errors - Now, when percentage_difference is 0, it will log a matching baseline message --- .../tests/performance_tests/common.py | 27 ++++++++++++ .../tests/performance_tests/conftest.py | 41 ++++++++++++++++++ .../tests/performance_tests/test_openfoam.py | 34 ++++----------- .../test_openfoam/dependencies.install.sh | 8 ++++ .../test_openfoam/pcluster.config.yaml | 22 +++++++++- .../tests/performance_tests/test_starccm.py | 42 +++---------------- .../test_starccm/pcluster.config.yaml | 7 ++++ 7 files changed, 117 insertions(+), 64 deletions(-) create mode 100644 tests/integration-tests/tests/performance_tests/conftest.py create mode 100644 tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/common.py b/tests/integration-tests/tests/performance_tests/common.py index 9e49f9dd85..ac7384875f 100644 --- a/tests/integration-tests/tests/performance_tests/common.py +++ b/tests/integration-tests/tests/performance_tests/common.py @@ -27,6 +27,7 @@ PYTEST_PARAMETERIZE_VALUES = [(NUM_COMPUTE_NODES, 1)] TEST_RUNNER_SCRIPT = "/shared/assets/workloads/scale-test/run-scale-test.sh" ROUND_UP_FACTOR = 100_000_000 +PERF_TEST_DIFFERENCE_TOLERANCE = 3 METRICS = [ dict(name="jobRunTime", unit="ms"), @@ -222,3 +223,29 @@ def write_results_to_output_dir( paths["baseline"]["statistics.json"], paths[candidate_configuration]["statistics.json"], ) + + +def perf_test_difference(observed_value, baseline_value): + percentage_difference = 100 * (observed_value - baseline_value) / baseline_value + return percentage_difference + + +def _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value): + percentage_difference = perf_test_difference(observed_value, baseline_value) + if percentage_difference < 0: + outcome = "improvement" + elif percentage_difference == 0: + outcome = "matching baseline" + elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: + outcome = "degradation (within tolerance)" + else: + outcome = "degradation (above tolerance)" + performance_degradation[node] = { + "baseline": baseline_value, + "observed": observed_value, + "percentage_difference": percentage_difference, + } + logging.info( + f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " + f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" + ) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py new file mode 100644 index 0000000000..fd2705a98d --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -0,0 +1,41 @@ +# Copyright 2024 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging + +import boto3 +import pytest + +OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] +NUMBER_OF_NODES = [8, 16, 32] + + +@pytest.fixture(scope="session") +def shared_performance_test_cluster( + vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory +): + + def _shared_performance_test_cluster(instance, os, region, scheduler): + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + number_of_nodes=max(NUMBER_OF_NODES), + ) + cluster = clusters_factory(cluster_config) + logging.info("Cluster Created") + return cluster + + return _shared_performance_test_cluster diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py index 15e284db7b..0538db7f2b 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_openfoam.py @@ -4,6 +4,8 @@ import pytest from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.performance_tests.common import _log_output_performance_difference + # timeout in seconds OPENFOAM_INSTALLATION_TIMEOUT = 300 OPENFOAM_JOB_TIMEOUT = 5400 # Takes long time because during the first time, it's not only execute the job but also @@ -17,20 +19,15 @@ "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported } -PERF_TEST_DIFFERENCE_TOLERANCE = 3 - - -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference def openfoam_installed(headnode): cmd = '[ -d "/shared/SubspaceBenchmarks" ]' try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("OpenFOAM is not installed on the head node.") return False @@ -61,13 +58,11 @@ def test_openfoam( region, scheduler, pcluster_config_reader, - clusters_factory, + shared_performance_test_cluster, number_of_nodes, test_datadir, ): - cluster_config = pcluster_config_reader(number_of_nodes=max(number_of_nodes)) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") + cluster = shared_performance_test_cluster(instance, os, region, scheduler) remote_command_executor = RemoteCommandExecutor(cluster) if not openfoam_installed(remote_command_executor): logging.info("Installing OpenFOAM") @@ -92,22 +87,7 @@ def test_openfoam( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): 
baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) if performance_degradation: pytest.fail(f"Performance degradation detected: {performance_degradation}") diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh new file mode 100644 index 0000000000..e109f8583d --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# This script installs the necessary software stack for StarCCM+. +# Note: The same cluster is shared by both test_openfoam and test_starccm. +# The cluster will be created by whichever test (test_openfoam or test_starccm) is executed first. +# If test_openfoam is executed first, it will also need to install the required dependencies. +set -ex + +sudo yum install -y libnsl diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml index bf0ea9a2e5..0fce058376 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml @@ -16,12 +16,23 @@ HeadNode: - BucketName: performance-tests-resources-for-parallelcluster KeyName: openfoam/* EnableWriteAccess: false + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: starccm/* + EnableWriteAccess: false +{% if install_extra_deps %} + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} Scheduling: Scheduler: slurm SlurmQueues: - Name: q1 ComputeResources: - - Name: c5n18xl-efa + - Name: c5n-18xl-efa InstanceType: {{ instance }} MinCount: {{ number_of_nodes }} MaxCount: {{ number_of_nodes }} @@ -37,6 +48,15 @@ Scheduling: Iam: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore # Required to report patching status +{% if install_extra_deps %} + S3Access: + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} SharedStorage: - MountDir: /shared Name: shared-fsx diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py index a98014a265..53905d8336 100644 --- 
a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm.py @@ -7,6 +7,7 @@ from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files +from tests.performance_tests.common import _log_output_performance_difference # timeout in seconds STARCCM_INSTALLATION_TIMEOUT = 1800 @@ -22,7 +23,6 @@ "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 "rocky8": {8: 66.859, 16: 36.184, 32: 21.090}, # v3.8.0 } -PERF_TEST_DIFFERENCE_TOLERANCE = 3 OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] @@ -34,17 +34,13 @@ def get_starccm_secrets(region_name): return secrets["podkey"], secrets["licpath"] -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference - - def starccm_installed(headnode): cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("STAR-CCM+ is not installed on the head node.") return False @@ -76,24 +72,13 @@ def test_starccm( region, scheduler, pcluster_config_reader, - clusters_factory, + shared_performance_test_cluster, number_of_nodes, test_datadir, scheduler_commands_factory, s3_bucket_factory, ): - # Create S3 bucket for custom actions scripts - bucket_name = s3_bucket_factory() - s3 = boto3.client("s3") - s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - - cluster_config = pcluster_config_reader( - bucket_name=bucket_name, - install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, - number_of_nodes=max(number_of_nodes), - ) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") + cluster = shared_performance_test_cluster(instance, os, region, scheduler) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) @@ -126,22 +111,7 @@ def test_starccm( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml 
b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml index 3c456e7a13..6a317a9767 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml @@ -1,6 +1,8 @@ Region: {{ region }} Image: Os: {{ os }} +Imds: + ImdsSupport: v2.0 HeadNode: InstanceType: {{ instance }} Networking: @@ -11,6 +13,9 @@ HeadNode: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore #Required to report patching status S3Access: + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: openfoam/* + EnableWriteAccess: false - BucketName: performance-tests-resources-for-parallelcluster KeyName: starccm/* EnableWriteAccess: false @@ -59,5 +64,7 @@ SharedStorage: FsxLustreSettings: StorageCapacity: 2400 DeploymentType: PERSISTENT_1 + AutomaticBackupRetentionDays: 30 + DailyAutomaticBackupStartTime: 00:00 PerUnitStorageThroughput: 100 StorageType: SSD From 8518d79deff15ed9657f43c26c21e331564ad17c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 25 Jun 2024 22:50:09 -0400 Subject: [PATCH 05/14] Change the shared_performance_test_cluster scope from session to class --- tests/integration-tests/tests/performance_tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index fd2705a98d..1cb8f7b114 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -19,7 +19,7 @@ NUMBER_OF_NODES = [8, 16, 32] -@pytest.fixture(scope="session") +@pytest.fixture(scope="class") def shared_performance_test_cluster( vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory ): From 84ba647b5d154b261d25507b5fe38a921cefa12c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 26 Jun 2024 10:06:14 -0400 Subject: [PATCH 06/14] Add shared fixture to test --- .../tests/performance_tests/conftest.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index 1cb8f7b114..b5fc7fa689 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -11,9 +11,15 @@ # See the License for the specific language governing permissions and limitations under the License. import logging +import os import boto3 import pytest +from jinja2 import FileSystemLoader +from jinja2.sandbox import SandboxedEnvironment + +from conftest import _get_default_template_values, inject_additional_config_settings, \ + inject_additional_image_configs_settings OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] NUMBER_OF_NODES = [8, 16, 32] @@ -39,3 +45,55 @@ def _shared_performance_test_cluster(instance, os, region, scheduler): return cluster return _shared_performance_test_cluster + + +@pytest.fixture(scope="class") +def shared_test_datadir(request, datadir): + """ + Inject the datadir with resources for the specific test function. + + If the test function is declared in a class then datadir is ClassName/FunctionName + otherwise it is only FunctionName. 
+ """ + function_name = request.function.__name__ + if not request.cls: + return datadir / function_name + + class_name = request.cls.__name__ + return datadir / "{0}/{1}".format(class_name, function_name) + + +@pytest.fixture(scope="class") +def shared_pcluster_config_reader(test_datadir, vpc_stack, request, region): + """ + Define a fixture to render pcluster config templates associated to the running test. + + The config for a given test is a pcluster.config.yaml file stored in the configs_datadir folder. + The config can be written by using Jinja2 template engine. + The current renderer already replaces placeholders for current keys: + {{ region }}, {{ os }}, {{ instance }}, {{ scheduler}}, {{ key_name }}, + {{ vpc_id }}, {{ public_subnet_id }}, {{ private_subnet_id }}, {{ default_vpc_security_group_id }} + The current renderer injects options for custom templates and packages in case these + are passed to the cli and not present already in the cluster config. + Also sanity_check is set to true by default unless explicitly set in config. + + :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template + """ + + def _config_renderer(config_file="pcluster.config.yaml", benchmarks=None, output_file=None, **kwargs): + config_file_path = test_datadir / config_file + if not os.path.isfile(config_file_path): + raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") + output_file_path = test_datadir / output_file if output_file else config_file_path + default_values = _get_default_template_values(vpc_stack, request) + file_loader = FileSystemLoader(str(test_datadir)) + env = SandboxedEnvironment(loader=file_loader) + rendered_template = env.get_template(config_file).render(**{**default_values, **kwargs}) + output_file_path.write_text(rendered_template) + if not config_file.endswith("image.config.yaml"): + inject_additional_config_settings(output_file_path, request, region, benchmarks) + else: + inject_additional_image_configs_settings(output_file_path, request) + return output_file_path + + return _config_renderer From e3210d63d17cc4e48a0ee87b48e7452362afbe97 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 26 Jun 2024 16:28:32 -0400 Subject: [PATCH 07/14] Use shared fixture in shared cluster fixture for now to test, will modify in the future --- tests/integration-tests/tests/performance_tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py index b5fc7fa689..2176ee91a2 100644 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ b/tests/integration-tests/tests/performance_tests/conftest.py @@ -27,15 +27,15 @@ @pytest.fixture(scope="class") def shared_performance_test_cluster( - vpc_stack, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory + vpc_stack, shared_pcluster_config_reader, clusters_factory, shared_test_datadir, s3_bucket_factory ): def _shared_performance_test_cluster(instance, os, region, scheduler): bucket_name = s3_bucket_factory() s3 = boto3.client("s3") - s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + s3.upload_file(str(shared_test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - cluster_config = pcluster_config_reader( + cluster_config = shared_pcluster_config_reader( 
bucket_name=bucket_name, install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(NUMBER_OF_NODES), From c5d59056459cd474c014ce9ce854342fe2c93637 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 30 Jun 2024 21:23:09 -0400 Subject: [PATCH 08/14] Restructure. Group two tests in a module, restructure files to adopt changes. --- tests/integration-tests/conftest.py | 81 +++++++++++- .../tests/performance_tests/conftest.py | 99 -------------- .../tests/performance_tests/test_openfoam.py | 95 -------------- ...tarccm.py => test_starccm_and_openfoam.py} | 122 +++++++++++++++++- .../test_openfoam/dependencies.install.sh | 0 .../test_openfoam/openfoam.install.sh | 0 .../test_openfoam/openfoam.results.sh | 0 .../test_openfoam/openfoam.slurm.sh | 0 .../test_openfoam/pcluster.config.yaml | 0 .../test_starccm/dependencies.install.sh | 0 .../test_starccm/pcluster.config.yaml | 0 .../test_starccm/starccm.install.sh | 0 .../test_starccm/starccm.results.sh | 0 .../test_starccm/starccm.slurm.sh | 0 14 files changed, 197 insertions(+), 200 deletions(-) delete mode 100644 tests/integration-tests/tests/performance_tests/conftest.py delete mode 100644 tests/integration-tests/tests/performance_tests/test_openfoam.py rename tests/integration-tests/tests/performance_tests/{test_starccm.py => test_starccm_and_openfoam.py} (50%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/dependencies.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.results.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/openfoam.slurm.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_openfoam => test_starccm_and_openfoam}/test_openfoam/pcluster.config.yaml (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/dependencies.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/pcluster.config.yaml (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.install.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.results.sh (100%) rename tests/integration-tests/tests/performance_tests/{test_starccm => test_starccm_and_openfoam}/test_starccm/starccm.slurm.sh (100%) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a25cfc5f59..c2ad5151e5 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -21,6 +21,7 @@ from functools import partial from itertools import product from shutil import copyfile +from time import sleep from traceback import format_tb from typing import Any, Dict, List, Optional, Union @@ -366,6 +367,70 @@ def _setup_custom_logger(log_file): logger.addHandler(file_handler) +class SharedClusterDetectionTimeoutError(Exception): + """Custom exception for shared cluster detection timeout.""" + + pass + + +@pytest.fixture(scope="module") +@pytest.mark.usefixtures("setup_credentials") +def shared_clusters_factory(request): + """ + Define a fixture to manage the creation and 
destruction of module shared clusters. + + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success")) + + if not hasattr(request.module, "shared_existing_cluster"): + request.module.shared_existing_cluster = None + request.module.is_cluster_started_to_create = False + + def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs): + if request.module.is_cluster_started_to_create: + for retry in range(40): + if request.module.shared_existing_cluster: + logging.info(f"Shared cluster {request.module.shared_existing_cluster.name} detected.") + return request.module.shared_existing_cluster + else: + logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)") + sleep(60) + raise SharedClusterDetectionTimeoutError( + "Timeout: Failed to detect the shared cluster within the allowed retries." + ) + + request.module.is_cluster_started_to_create = True + cluster_config = _write_config_to_outdir(request, cluster_config, "clusters_configs") + cluster = Cluster( + name=( + request.config.getoption("cluster") + if request.config.getoption("cluster") + else "integ-tests-{0}{1}{2}".format( + random_alphanumeric().upper() if upper_case_cluster_name else random_alphanumeric(), + "-" if request.config.getoption("stackname_suffix") else "", + request.config.getoption("stackname_suffix"), + ) + ), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + region=region, + custom_cli_credentials=custom_cli_credentials, + ) + if not request.config.getoption("cluster"): + cluster.creation_response = factory.create_cluster(cluster, **kwargs) + request.module.shared_existing_cluster = cluster + return cluster + + yield _cluster_factory + if not request.config.getoption("no_delete"): + try: + test_passed = request.node.rep_call.passed + except AttributeError: + test_passed = False + factory.destroy_all_clusters(test_passed=test_passed) + + @pytest.fixture(scope="class") @pytest.mark.usefixtures("setup_credentials") def clusters_factory(request, region): @@ -509,9 +574,21 @@ def _write_config_to_outdir(request, config, config_dir): out_dir = request.config.getoption("output_dir") # Sanitize config file name to make it Windows compatible - # request.node.nodeid example: + # class scope request.node.nodeid example: # 'dcv/test_dcv.py::test_dcv_configuration[eu-west-1-c5.xlarge-centos7-slurm-8443-0.0.0.0/0-/shared]' - test_file, test_name = request.node.nodeid.split("::", 1) + # module scope request.node.nodeid example: + # 'performance_tests/test_starccm_and_openfoam.py' + # TODO: Find a better way to name module_scope_test + logging.info(f"request.node.nodeid: {request.node.nodeid}") + nodeid_parts = request.node.nodeid.split("::") + if len(nodeid_parts) == 2: + test_file, test_name = nodeid_parts + elif len(nodeid_parts) == 1: + test_file = nodeid_parts[0] + test_name = "module_scope_test" + else: + raise ValueError(f"Unexpected nodeid format: {request.node.nodeid}") + config_file_name = "{0}-{1}".format(test_file, test_name.replace("/", "_")) os.makedirs( diff --git a/tests/integration-tests/tests/performance_tests/conftest.py b/tests/integration-tests/tests/performance_tests/conftest.py deleted file mode 100644 index 2176ee91a2..0000000000 --- a/tests/integration-tests/tests/performance_tests/conftest.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2024 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -import logging -import os - -import boto3 -import pytest -from jinja2 import FileSystemLoader -from jinja2.sandbox import SandboxedEnvironment - -from conftest import _get_default_template_values, inject_additional_config_settings, \ - inject_additional_image_configs_settings - -OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] -NUMBER_OF_NODES = [8, 16, 32] - - -@pytest.fixture(scope="class") -def shared_performance_test_cluster( - vpc_stack, shared_pcluster_config_reader, clusters_factory, shared_test_datadir, s3_bucket_factory -): - - def _shared_performance_test_cluster(instance, os, region, scheduler): - bucket_name = s3_bucket_factory() - s3 = boto3.client("s3") - s3.upload_file(str(shared_test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - - cluster_config = shared_pcluster_config_reader( - bucket_name=bucket_name, - install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, - number_of_nodes=max(NUMBER_OF_NODES), - ) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") - return cluster - - return _shared_performance_test_cluster - - -@pytest.fixture(scope="class") -def shared_test_datadir(request, datadir): - """ - Inject the datadir with resources for the specific test function. - - If the test function is declared in a class then datadir is ClassName/FunctionName - otherwise it is only FunctionName. - """ - function_name = request.function.__name__ - if not request.cls: - return datadir / function_name - - class_name = request.cls.__name__ - return datadir / "{0}/{1}".format(class_name, function_name) - - -@pytest.fixture(scope="class") -def shared_pcluster_config_reader(test_datadir, vpc_stack, request, region): - """ - Define a fixture to render pcluster config templates associated to the running test. - - The config for a given test is a pcluster.config.yaml file stored in the configs_datadir folder. - The config can be written by using Jinja2 template engine. - The current renderer already replaces placeholders for current keys: - {{ region }}, {{ os }}, {{ instance }}, {{ scheduler}}, {{ key_name }}, - {{ vpc_id }}, {{ public_subnet_id }}, {{ private_subnet_id }}, {{ default_vpc_security_group_id }} - The current renderer injects options for custom templates and packages in case these - are passed to the cli and not present already in the cluster config. - Also sanity_check is set to true by default unless explicitly set in config. 
- - :return: a _config_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template - """ - - def _config_renderer(config_file="pcluster.config.yaml", benchmarks=None, output_file=None, **kwargs): - config_file_path = test_datadir / config_file - if not os.path.isfile(config_file_path): - raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") - output_file_path = test_datadir / output_file if output_file else config_file_path - default_values = _get_default_template_values(vpc_stack, request) - file_loader = FileSystemLoader(str(test_datadir)) - env = SandboxedEnvironment(loader=file_loader) - rendered_template = env.get_template(config_file).render(**{**default_values, **kwargs}) - output_file_path.write_text(rendered_template) - if not config_file.endswith("image.config.yaml"): - inject_additional_config_settings(output_file_path, request, region, benchmarks) - else: - inject_additional_image_configs_settings(output_file_path, request) - return output_file_path - - return _config_renderer diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py deleted file mode 100644 index 0538db7f2b..0000000000 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ /dev/null @@ -1,95 +0,0 @@ -import logging -from concurrent.futures.thread import ThreadPoolExecutor - -import pytest -from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor - -from tests.performance_tests.common import _log_output_performance_difference - -# timeout in seconds -OPENFOAM_INSTALLATION_TIMEOUT = 300 -OPENFOAM_JOB_TIMEOUT = 5400 # Takes long time because during the first time, it's not only execute the job but also -# builds and installs many things -TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { - "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 - "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported - "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 - "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 - "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported - "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported -} - - -def openfoam_installed(headnode): - cmd = '[ -d "/shared/SubspaceBenchmarks" ]' - try: - headnode.run_remote_command(cmd, log_error=False) - return True - except RemoteCommandExecutionError: - logging.info("OpenFOAM is not installed on the head node.") - return False - - -def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): - subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" - logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") - remote_command_executor.run_remote_command( - f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', - timeout=OPENFOAM_JOB_TIMEOUT, - ) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "openfoam.results.sh")), hide=False - ) - output = perf_test_result.stdout.strip() - observed_value = int(output.split("\n")[-1].strip()) - logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") - return observed_value - - -@pytest.mark.parametrize( - "number_of_nodes", - [[8, 16, 32]], -) -def test_openfoam( - vpc_stack, - instance, - os, - region, - scheduler, - 
pcluster_config_reader, - shared_performance_test_cluster, - number_of_nodes, - test_datadir, -): - cluster = shared_performance_test_cluster(instance, os, region, scheduler) - remote_command_executor = RemoteCommandExecutor(cluster) - if not openfoam_installed(remote_command_executor): - logging.info("Installing OpenFOAM") - remote_command_executor.run_remote_script( - str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False - ) - logging.info("OpenFOAM Installed") - performance_degradation = {} - - # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel - remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) - # Run 8 and 16 node tests in parallel - with ThreadPoolExecutor(max_workers=2) as executor: - future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) - future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) - observed_value_8 = future_8.result() - observed_value_16 = future_16.result() - - # Run 32 node test - observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) - - # Check results and log performance degradation - for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) - - if performance_degradation: - pytest.fail(f"Performance degradation detected: {performance_degradation}") - else: - logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py similarity index 50% rename from tests/integration-tests/tests/performance_tests/test_starccm.py rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py index 53905d8336..b02f579f4d 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py @@ -13,8 +13,12 @@ STARCCM_INSTALLATION_TIMEOUT = 1800 STARCCM_JOB_TIMEOUT = 600 STARCCM_LICENCE_SECRET = "starccm-license-secret" + +OPENFOAM_INSTALLATION_TIMEOUT = 300 +OPENFOAM_JOB_TIMEOUT = 5400 # Takes a long time because during the first run it not only executes the job but also + TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM = { "alinux2023": {8: 62.414, 16: 31.998, 32: 20.422}, # v3.10.0 "alinux2": {8: 64.475, 16: 33.173, 32: 17.899}, # v3.1.3 "ubuntu2204": {8: 75.502, 16: 36.353, 32: 19.688}, # v3.7.0 @@ -24,6 +28,15 @@ "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 "rocky8": {8: 66.859, 16: 36.184, 32: 21.090}, # v3.8.0 } +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM = { + "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 + "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported + "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 + "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 + "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported + "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported +} + OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] @@ -34,6 +47,32 @@ def 
get_starccm_secrets(region_name): return secrets["podkey"], secrets["licpath"] +def openfoam_installed(headnode): + cmd = '[ -d "/shared/SubspaceBenchmarks" ]' + try: + headnode.run_remote_command(cmd, log_error=False) + return True + except RemoteCommandExecutionError: + logging.info("OpenFOAM is not installed on the head node.") + return False + + +def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): + subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" + logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") + remote_command_executor.run_remote_command( + f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', + timeout=OPENFOAM_JOB_TIMEOUT, + ) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "openfoam.results.sh")), hide=False + ) + output = perf_test_result.stdout.strip() + observed_value = int(output.split("\n")[-1].strip()) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + def starccm_installed(headnode): cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" try: - headnode.run_remote_command(cmd) + headnode.run_remote_command(cmd, log_error=False) return True except RemoteCommandExecutionError: + logging.info("STAR-CCM+ is not installed on the head node.") return False @@ -72,13 +111,27 @@ def test_starccm( region, scheduler, pcluster_config_reader, - shared_performance_test_cluster, + shared_clusters_factory, number_of_nodes, test_datadir, scheduler_commands_factory, s3_bucket_factory, ): - cluster = shared_performance_test_cluster(instance, os, region, scheduler) + logging.info("start to create s3") + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + number_of_nodes=max(number_of_nodes), + ) + test_region = region + logging.info(f"test region is {test_region}") + cluster = shared_clusters_factory(cluster_config, test_region) + logging.info("Cluster Created") + remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) @@ -110,7 +163,7 @@ def test_starccm( # Check results and log performance degradation for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM[os][node] _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) @@ -119,3 +172,64 @@ def test_starccm( pytest.fail(f"Performance degradation detected: {performance_degradation}") else: logging.info("Performance test results show no performance degradation") + + +@pytest.mark.parametrize( + "number_of_nodes", + [[8, 16, 32]], +) +def test_openfoam( + vpc_stack, + instance, + os, + region, + scheduler, + pcluster_config_reader, + shared_clusters_factory, + number_of_nodes, + test_datadir, + s3_bucket_factory, +): + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + 
number_of_nodes=max(number_of_nodes), + ) + test_region = region + logging.info(f"test region is {test_region}") + cluster = shared_clusters_factory(cluster_config, test_region) + logging.info("Cluster Created") + remote_command_executor = RemoteCommandExecutor(cluster) + if not openfoam_installed(remote_command_executor): + logging.info("Installing OpenFOAM") + remote_command_executor.run_remote_script( + str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False + ) + logging.info("OpenFOAM Installed") + performance_degradation = {} + + # Copy additional files in advance to avoid conflicts when running 8 and 16 node tests in parallel + remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) + future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM[os][node] + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) + + if performance_degradation: + pytest.fail(f"Performance degradation detected: {performance_degradation}") + else: + logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/dependencies.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh rename to 
tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh From ab38a3522188e75e55d8056df3fa69283592d0bc Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 1 Jul 2024 16:05:45 -0400 Subject: [PATCH 09/14] Modify hasattr logic, add logging info to debug --- tests/integration-tests/conftest.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index c2ad5151e5..fe15d4cbea 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -373,6 +373,12 @@ class SharedClusterDetectionTimeoutError(Exception): pass +class SharedClusterDetectionTimeoutError(Exception): + """Custom exception for 
From 86345309fc2344da89372d9bdc667d7cac85bc23 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 12:49:16 -0400
Subject: [PATCH 10/14] Delete duplicate exception class

---
 tests/integration-tests/conftest.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index fe15d4cbea..70e0154fb2 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -369,13 +369,6 @@ def _setup_custom_logger(log_file):

 class SharedClusterDetectionTimeoutError(Exception):
     """Custom exception for shared cluster detection timeout."""
-
-    pass
-
-
-class SharedClusterDetectionTimeoutError(Exception):
-    """Custom exception for shared cluster detection timeout."""
-
     pass

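
The duplicate removed here is more than cosmetic: a second `class` statement with the same name silently rebinds it, so any code still holding the first class object no longer matches `except` clauses written against the name. A minimal, self-contained illustration (unrelated to the patched file):

    class Err(Exception):
        """First definition."""

    FirstErr = Err  # keep a reference to the first class object

    class Err(Exception):  # noqa: F811 -- redefinition silently rebinds the name
        """Second, duplicate definition."""

    try:
        raise FirstErr()
    except Err:
        print("caught")  # not reached: Err now names the second class
    except Exception:
        print("missed: the two definitions are distinct classes")  # this prints
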
From 04132f315f83e3ff24f2bf925eb0887610f473fa Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 18:44:16 -0400
Subject: [PATCH 11/14] Use xdist_session_fixture, use hashmap and set to adopt changes

---
 tests/integration-tests/conftest.py | 38 ++++++++++---------
 .../test_starccm_and_openfoam.py | 8 +---
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index 70e0154fb2..ff8fe5c955 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -372,32 +372,33 @@ class SharedClusterDetectionTimeoutError(Exception):
     pass


-@pytest.fixture(scope="module")
+@xdist_session_fixture(autouse=True)
 @pytest.mark.usefixtures("setup_credentials")
 def shared_clusters_factory(request):
     """
-    Define a fixture to manage the creation and destruction of module shared clusters.
+    Define a fixture to manage the creation and destruction of session shared clusters.

     The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config
     """
     factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success"))

-    if not hasattr(request.module, "is_cluster_started_to_create"):
-        logging.info("Setting is_cluster_started_to_create and shared_existing_cluster")
-        request.module.is_cluster_started_to_create = False
-        request.module.shared_existing_cluster = None
+    if not hasattr(request.session, "shared_existing_clusters"):
+        logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters")
+        request.session.shared_existing_clusters = {}
+        request.session.shared_existing_clusters_started_to_create = set()

-    def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+    def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+        cluster_key = f"{region}-{instance}-{os}-{scheduler}"
         logging.info(
-            "Shared cluster already started to create"
-            if request.module.is_cluster_started_to_create
-            else "Start to create shared cluster"
+            "Eligible for using shared cluster, start to detect."
+            if cluster_key in request.session.shared_existing_clusters_started_to_create
+            else "Start to create shared cluster for specific region, instance type, os and scheduler"
         )
-        if request.module.is_cluster_started_to_create:
+        if cluster_key in request.session.shared_existing_clusters_started_to_create:
             for retry in range(40):
-                if request.module.shared_existing_cluster:
-                    logging.info(f"Shared cluster {request.module.shared_existing_cluster.name} detected.")
-                    return request.module.shared_existing_cluster
+                if cluster_key in request.session.shared_existing_clusters:
+                    logging.info(f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected.")
+                    return request.session.shared_existing_clusters[cluster_key]
                 else:
                     logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)")
                     sleep(60)
@@ -405,7 +406,7 @@ def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, cust
                 "Timeout: Failed to detect the shared cluster within the allowed retries."
             )
-        request.module.is_cluster_started_to_create = True
+        request.session.shared_existing_clusters_started_to_create.add(cluster_key)
         cluster_config = _write_config_to_outdir(request, cluster_config, "clusters_configs")
         cluster = Cluster(
             name=(
@@ -424,10 +425,11 @@ def _cluster_factory(cluster_config, region, upper_case_cluster_name=False, cust
         )
         if not request.config.getoption("cluster"):
             cluster.creation_response = factory.create_cluster(cluster, **kwargs)
-            request.module.shared_existing_cluster = cluster
+            request.session.shared_existing_clusters[cluster_key] = cluster
         return cluster

     yield _cluster_factory
+
     if not request.config.getoption("no_delete"):
         try:
             test_passed = request.node.rep_call.passed
@@ -583,14 +585,14 @@ def _write_config_to_outdir(request, config, config_dir):
     # 'dcv/test_dcv.py::test_dcv_configuration[eu-west-1-c5.xlarge-centos7-slurm-8443-0.0.0.0/0-/shared]'
     # module scope request.node.nodeid example:
     # 'performance_tests/test_starccm_and_openfoam.py'
-    # TODO: Find a better way to name module_scope_test
+    # TODO: Find a better way to name module_scope_test/session_scope_test
     logging.info(f"request.node.nodeid: {request.node.nodeid}")
     nodeid_parts = request.node.nodeid.split("::")
     if len(nodeid_parts) == 2:
         test_file, test_name = nodeid_parts
     elif len(nodeid_parts) == 1:
         test_file = nodeid_parts[0]
-        test_name = "module_scope_test"
+        test_name = "module_scope_test" + random_alphanumeric()
     else:
         raise ValueError(f"Unexpected nodeid format: {request.node.nodeid}")
diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
index b02f579f4d..fb76ecd234 100644
--- a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
+++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py
@@ -127,9 +127,7 @@ def test_starccm(
         install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS,
         number_of_nodes=max(number_of_nodes),
     )
-    test_region = region
-    logging.info(f"test region is {test_region}")
-    cluster = shared_clusters_factory(cluster_config, test_region)
+    cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler)
     logging.info("Cluster Created")

     remote_command_executor = RemoteCommandExecutor(cluster)
@@ -199,9 +197,7 @@ def test_openfoam(
         install_extra_deps=os in number_of_nodes,
         number_of_nodes=max(number_of_nodes),
     )
-    test_region = region
-    logging.info(f"test region is {test_region}")
-    cluster = shared_clusters_factory(cluster_config, test_region)
+    cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler)
     logging.info("Cluster Created")
     remote_command_executor = RemoteCommandExecutor(cluster)
     if not openfoam_installed(remote_command_executor):
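
PATCH 11 generalizes the single boolean to per-configuration bookkeeping: a set records which `{region}-{instance}-{os}-{scheduler}` keys have started creating, and a dict records the finished clusters. A minimal sketch of that bookkeeping, extending the earlier one and again with illustrative names:

    import logging
    import time

    created = {}  # cluster_key -> finished resource
    started = set()  # cluster_keys whose creation has begun

    def get_or_create(region, instance, os, scheduler, create_fn, retries=40, delay=60):
        """Create one shared resource per (region, instance, os, scheduler) combination."""
        cluster_key = f"{region}-{instance}-{os}-{scheduler}"
        if cluster_key in started:
            for retry in range(retries):
                if cluster_key in created:
                    return created[cluster_key]
                logging.info(f"{cluster_key} not ready yet. Retrying... ({retry + 1}/{retries})")
                time.sleep(delay)
            raise TimeoutError(f"Timed out waiting for {cluster_key}")
        started.add(cluster_key)
        created[cluster_key] = create_fn()
        return created[cluster_key]

Tests that share all four dimensions reuse one cluster; a difference in any dimension yields a new key and a fresh cluster.
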
From 9c0dc7d6c5f02dbf86e7897994f08fa60bad2639 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 19:43:21 -0400
Subject: [PATCH 12/14] Use _cluster_factory_wrapper to try to avoid AttributeError: Can't pickle local object

---
 tests/integration-tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index ff8fe5c955..cfbe9e5c22 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -387,6 +387,10 @@ def shared_clusters_factory(request):
         request.session.shared_existing_clusters = {}
         request.session.shared_existing_clusters_started_to_create = set()

+    yield from _cluster_factory_wrapper(request, factory)
+
+
+def _cluster_factory_wrapper(request, factory):
     def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
         cluster_key = f"{region}-{instance}-{os}-{scheduler}"
         logging.info(
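
The error named in this and the next two subjects stems from how pickle serializes functions: by importable qualified name. A function defined inside another function has a `<locals>` qualname that cannot be looked up on import, so it cannot be pickled no matter where its enclosing wrapper lives — which is why this wrapper alone does not resolve the issue. A standalone demonstration:

    import pickle

    def make_factory():
        def _factory():  # qualname is 'make_factory.<locals>._factory'
            return 42
        return _factory

    try:
        pickle.dumps(make_factory())
    except (AttributeError, pickle.PicklingError) as exc:
        # e.g. "Can't pickle local object 'make_factory.<locals>._factory'"
        print(exc)
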
From 8ee614b5a96034ada9ac384775a9714fe9ac04e2 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Tue, 2 Jul 2024 20:57:48 -0400
Subject: [PATCH 13/14] Use Encapsulation Classes to try to avoid AttributeError: Can't pickle local object

---
 tests/integration-tests/conftest.py | 62 +++++++++++++++++++----------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index cfbe9e5c22..d619bb095c 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -369,30 +369,32 @@ def _setup_custom_logger(log_file):
 class SharedClusterDetectionTimeoutError(Exception):
     """Custom exception for shared cluster detection timeout."""

-    pass
-
-
-@xdist_session_fixture(autouse=True)
-@pytest.mark.usefixtures("setup_credentials")
-def shared_clusters_factory(request):
-    """
-    Define a fixture to manage the creation and destruction of session shared clusters.
-
-    The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config
-    """
-    factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success"))
+    pass

-    if not hasattr(request.session, "shared_existing_clusters"):
-        logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters")
-        request.session.shared_existing_clusters = {}
-        request.session.shared_existing_clusters_started_to_create = set()
+class ClusterManager:
+    """Cluster Manager for shared cluster fixture to avoid AttributeError: Can't pickle local object"""

-    yield from _cluster_factory_wrapper(request, factory)
+    def __init__(self, request, factory):
+        self.request = request
+        self.factory = factory

-def _cluster_factory_wrapper(request, factory):
-    def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case_cluster_name=False, custom_cli_credentials=None, **kwargs):
+    def cluster_factory(
+        self,
+        cluster_config,
+        region,
+        instance,
+        os,
+        scheduler,
+        upper_case_cluster_name=False,
+        custom_cli_credentials=None,
+        **kwargs,
+    ):
+        """Create cluster or use existing cluster."""
         cluster_key = f"{region}-{instance}-{os}-{scheduler}"
+        request = self.request
+        factory = self.factory
         logging.info(
             "Eligible for using shared cluster, start to detect."
             if cluster_key in request.session.shared_existing_clusters_started_to_create
@@ -403,7 +403,9 @@
         if cluster_key in request.session.shared_existing_clusters_started_to_create:
             for retry in range(40):
                 if cluster_key in request.session.shared_existing_clusters:
-                    logging.info(f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected.")
+                    logging.info(
+                        f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected."
+                    )
                     return request.session.shared_existing_clusters[cluster_key]
                 else:
                     logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)")
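
A bound method of a module-level class, by contrast, pickles by reference to the class plus the instance's state, so hoisting the factory into `ClusterManager` can sidestep the `<locals>` problem — provided everything the instance carries is itself picklable. A standalone sketch:

    import pickle

    class Manager:
        """Module-level class, so its methods have importable qualnames."""

        def __init__(self, value):
            self.value = value

        def factory(self):
            return self.value

    m = Manager(42)
    restored = pickle.loads(pickle.dumps(m.factory))  # bound method round-trips
    print(restored())  # 42

Pickling `m.factory` also pickles `m` and everything it references, so an instance holding live objects such as a pytest `request` may still fail — which appears to be what the final patch below is working around by yielding the manager itself.
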
({retry + 1}/40)") @@ -432,7 +436,25 @@ def _cluster_factory(cluster_config, region, instance, os, scheduler, upper_case request.session.shared_existing_clusters[cluster_key] = cluster return cluster - yield _cluster_factory + +@xdist_session_fixture(autouse=True) +@pytest.mark.usefixtures("setup_credentials") +def shared_clusters_factory(request): + """ + Define a fixture to manage the creation and destruction of session shared clusters. + + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success")) + + if not hasattr(request.session, "shared_existing_clusters"): + logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters") + request.session.shared_existing_clusters = {} + request.session.shared_existing_clusters_started_to_create = set() + + manager = ClusterManager(request, factory) + + yield manager.cluster_factory if not request.config.getoption("no_delete"): try: From a9eb36628189dc37dee15bc362fb0148de62d471 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 2 Jul 2024 21:11:52 -0400 Subject: [PATCH 14/14] yield ClusterManager instead of funtion in it to try to avoid AttributeError: Can't pickle local object --- tests/integration-tests/conftest.py | 2 +- .../tests/performance_tests/test_starccm_and_openfoam.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index d619bb095c..eb2a27df93 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -454,7 +454,7 @@ def shared_clusters_factory(request): manager = ClusterManager(request, factory) - yield manager.cluster_factory + yield manager if not request.config.getoption("no_delete"): try: diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py index fb76ecd234..bc0860082c 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py @@ -127,7 +127,7 @@ def test_starccm( install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, number_of_nodes=max(number_of_nodes), ) - cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) logging.info("Cluster Created") remote_command_executor = RemoteCommandExecutor(cluster) @@ -197,7 +197,7 @@ def test_openfoam( install_extra_deps=os in number_of_nodes, number_of_nodes=max(number_of_nodes), ) - cluster = shared_clusters_factory(cluster_config, region, instance, os, scheduler) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) logging.info("Cluster Created") remote_command_executor = RemoteCommandExecutor(cluster) if not openfoam_installed(remote_command_executor):