diff --git a/.github/workflows/balfrin-ci.yml b/.github/workflows/balfrin-ci.yml
new file mode 100644
index 0000000000..0c68ef1b5f
--- /dev/null
+++ b/.github/workflows/balfrin-ci.yml
@@ -0,0 +1,75 @@
+#
+# Copyright (c) 2025, ETH Zurich. All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+name: Balfrin-CI
+
+# Controls when the workflow will run
+on:
+  # Triggers the workflow on push or pull request events but only for the "main" branch
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "test_cluster"
+  test_cluster:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        system_name: ["balfrin"]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v3
+
+      - name: setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+
+      - name: install python packages
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements/ci.txt
+
+      - name: Run testing script
+        env:
+          FIRECREST_CLIENT_ID: ${{ secrets.F7T_CLIENT_ID }}
+          FIRECREST_CLIENT_SECRET: ${{ secrets.F7T_CLIENT_SECRET }}
+          FIRECREST_URL: ${{ secrets.F7T_URL }}
+          AUTH_TOKEN_URL: ${{ secrets.F7T_TOKEN_URL }}
+          SYSTEM_WORKING_DIR: ${{ vars.F7T_SYSTEM_WORKING_DIR }}
+          # GitHub context values are passed through the environment rather than
+          # expanded inside the script: a pull-request branch name is
+          # attacker-controlled and would otherwise be interpreted by the shell.
+          EVENT_NAME: ${{ github.event_name }}
+          HEAD_REF: ${{ github.head_ref }}
+          REF_NAME: ${{ github.ref_name }}
+          SYSTEM_NAME: ${{ matrix.system_name }}
+          REPO_URL: ${{ github.server_url }}/${{ github.repository }}.git
+        run: |
+          echo "Event name: $EVENT_NAME"
+          echo "Head ref: $HEAD_REF"
+          echo "Ref name: $REF_NAME"
+          if [ "$EVENT_NAME" = "pull_request" ]; then
+            echo "Using branch: $HEAD_REF"
+            BRANCH_NAME="$HEAD_REF"
+          else
+            echo "Using branch: $REF_NAME"
+            BRANCH_NAME="$REF_NAME"
+          fi
+          python ci/ci_script.py \
+            --system="$SYSTEM_NAME" \
+            --branch="$BRANCH_NAME" \
+            --repo="$REPO_URL" \
+            --account=s83
diff --git a/ci/ci_script.py b/ci/ci_script.py
new file mode 100644
index 0000000000..4791da6497
--- /dev/null
+++ b/ci/ci_script.py
@@ -0,0 +1,184 @@
+#
+# Copyright (c) 2025, ETH Zurich. All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+import firecrest as fc
+import os
+import sys
+import time
+import argparse
+import utilities as util
+
+from firecrest import FirecrestException
+
+
+# Job states after which polling stops
+final_slurm_states = {
+    "BOOT_FAIL",
+    "CANCELLED",
+    "COMPLETED",
+    "DEADLINE",
+    "FAILED",
+    "NODE_FAIL",
+    "OUT_OF_MEMORY",
+    "PREEMPTED",
+    "TIMEOUT",
+}
+
+
+def select_dict_by_name(name, list_of_dicts, select_key="name"):
+    """Return the first dict whose ``select_key`` entry equals ``name``.
+
+    Returns ``None`` when nothing matches. Every dict visited must contain
+    ``select_key``; a missing key raises ``KeyError``.
+    """
+    return next((d for d in list_of_dicts if d[select_key] == name), None)
+
+
+def check_mandatory_env_var(env_var):
+    """Return the value of ``env_var``; exit with status 1 if unset or empty."""
+    value = os.environ.get(env_var)
+    if not value:
+        # Name the variable so a misconfigured run can be diagnosed from the log
+        print(
+            f"Mandatory environment variable `{env_var}` is not set",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    return value
+
+
+def _wait_for_final_state(client, system_name, job_id, poll_interval=10):
+    """Poll ``job_id`` until its state is in ``final_slurm_states``.
+
+    Returns the final state. A 404 response is taken to mean that the job is
+    not visible yet and is retried after two seconds; any other
+    ``FirecrestException`` propagates to the caller.
+    """
+    while True:
+        try:
+            poll_result = client.job_info(system_name, jobid=job_id)
+        except FirecrestException as e:
+            if e.responses[-1].status_code == 404:
+                print(f"No available information yet for job {job_id}")
+                time.sleep(2)
+                continue
+
+            # Bare raise keeps the original traceback
+            raise
+
+        print(f"Job status: {poll_result}")
+        state = poll_result[0]["status"]["state"]
+        if state in final_slurm_states:
+            print(f"Job is in final state: {state}")
+            return state
+
+        print(
+            f"Status of the job is {state}, "
+            f"will try again in {poll_interval} seconds"
+        )
+        time.sleep(poll_interval)
+
+
+def _print_remote_file(client, system_name, label, path, num_lines=1000):
+    """Print the last ``num_lines`` lines of a remote file and return them."""
+    print(f"\n{label} in {path}")
+    content = client.tail(system_name, path=path, num_lines=num_lines)["content"]
+    print(content)
+    return content
+
+
+def main():
+    """Submit the CI batch job through FirecREST and report its outcome.
+
+    Exits with status 1 when a mandatory environment variable is missing,
+    when the requested system or a healthy scheduler is not available, or
+    when the job ends in any state other than ``COMPLETED``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--system",
+        default=os.environ.get("FIRECREST_SYSTEM"),
+        help="choose system to run",
+    )
+    parser.add_argument("--branch", default="main", help="branch to be tested")
+    parser.add_argument(
+        "--account", default="csstaff", help="account for the batch job"
+    )
+    parser.add_argument("--repo", help="repository to be tested")
+
+    args = parser.parse_args()
+    system_name = args.system
+    ref = args.branch
+    print(f"Will try to run the ci in system {system_name} on branch {ref}")
+
+    # Setup variables of the client
+    CLIENT_ID = check_mandatory_env_var("FIRECREST_CLIENT_ID")
+    CLIENT_SECRET = check_mandatory_env_var("FIRECREST_CLIENT_SECRET")
+    FIRECREST_URL = check_mandatory_env_var("FIRECREST_URL")
+    AUTH_TOKEN_URL = check_mandatory_env_var("AUTH_TOKEN_URL")
+    SYSTEM_WORKING_DIR = check_mandatory_env_var("SYSTEM_WORKING_DIR")
+
+    keycloak = fc.ClientCredentialsAuth(CLIENT_ID, CLIENT_SECRET, AUTH_TOKEN_URL)
+    client = fc.v2.Firecrest(firecrest_url=FIRECREST_URL, authorization=keycloak)
+
+    all_systems = client.systems()
+    system_names = [system["name"] for system in all_systems]
+    print(f"Available systems: {', '.join(system_names)}")
+
+    # NOTE(review): --account is parsed but not forwarded while the line below
+    # stays commented out -- confirm this is intended.
+    script_content = util.create_batch_script(
+        repo=args.repo,
+        # num_nodes=2,
+        # account=args.account,
+        # custom_modules=["cray", "cray-python"],
+        branch=ref,
+    )
+
+    system_state = select_dict_by_name(system_name, all_systems)
+    if not system_state:
+        print(f"System `{system_name}` is not available")
+        sys.exit(1)
+
+    print(f"System info: {system_state}")
+
+    # scheduler information
+    scheduler_health_info = select_dict_by_name(
+        "scheduler", system_state["servicesHealth"], "serviceType"
+    )
+
+    # A system that lists no scheduler service is handled like an unhealthy one
+    if not scheduler_health_info or not scheduler_health_info["healthy"]:
+        print(f"Scheduler of system `{system_name}` is not healthy")
+        sys.exit(1)
+
+    job = client.submit(
+        system_name,
+        working_dir=SYSTEM_WORKING_DIR,
+        script_str=script_content,
+    )
+    job_id = job["jobId"]
+    print(f"Submitted job: {job_id}")
+    state = _wait_for_final_state(client, system_name, job_id)
+
+    stdout_content = _print_remote_file(
+        client, system_name, "STDOUT", os.path.join(SYSTEM_WORKING_DIR, "job.out")
+    )
+    _print_remote_file(
+        client, system_name, "STDERR", os.path.join(SYSTEM_WORKING_DIR, "job.err")
+    )
+
+    # Some sanity checks:
+    if state != "COMPLETED":
+        print(f"Job was not successful, status: {state}")
+        sys.exit(1)
+
+    util.check_output(stdout_content)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ci/utilities.py b/ci/utilities.py
new file mode 100644
index 0000000000..79cf99e43d
--- /dev/null
+++ b/ci/utilities.py
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2025, ETH Zurich. All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+import shlex
+
+
+def create_batch_script(
+    repo, num_nodes=1, account=None, custom_modules=None, branch="main", constraint=None
+):
+    """Build the text of the Slurm batch script that runs the test suite.
+
+    ``repo`` and ``branch`` select what is cloned. ``constraint`` and
+    ``account`` each add an ``#SBATCH`` line when truthy, and
+    ``custom_modules`` (an iterable of module names) adds a ``module load``
+    line when truthy. Returns the script as a string.
+    """
+    # The branch can come from a pull request, so it is quoted (together with
+    # the repository URL) before being placed on a shell command line.
+    quoted_branch = shlex.quote(str(branch))
+    quoted_repo = shlex.quote(str(repo))
+
+    script = f"""#!/bin/bash -l
+#SBATCH --job-name="ci_job-spack-c2sm"
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --time=2:00:00
+#SBATCH --partition=postproc
+#SBATCH --nodes={num_nodes}
+"""
+
+    if constraint:
+        script += f"#SBATCH --constraint={constraint}\n"
+
+    if account:
+        script += f"#SBATCH --account={account}\n"
+
+    script += f"""
+
+# Clone command will fail if the directory already exists
+# Remove this first if you are using the same working directory
+# every time
+if [ -d "firecrest-ci" ]; then
+    rm -rf firecrest-ci
+fi
+git clone --depth 1 --shallow-submodules --recurse-submodules -b {quoted_branch} {quoted_repo} firecrest-ci
+cd firecrest-ci
+
+module use /mch-environment/v8/modules
+module load python/3.11.7
+"""
+
+    if custom_modules:
+        script += f"module load {' '.join(custom_modules)}\n"
+
+    script += """
+python -m venv testing-venv
+source ./testing-venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install -r requirements/test.txt
+deactivate
+
+python --version
+
+source ./setup-env.sh
+spack spec gnuconfig
+
+source ./setup-env.sh /mch-environment/v8
+source ./testing-venv/bin/activate
+srun pytest -v -n 64 test/common_system_test.py test/balfrin_system_test.py
+"""
+
+    return script
+
+
+def check_output(file_content, expected="loops, best of"):
+    """Raise ``AssertionError`` unless ``expected`` occurs in ``file_content``.
+
+    NOTE(review): the default marker looks like ``timeit`` output, while the
+    batch script above runs pytest -- confirm which text the job is expected
+    to print.
+    """
+    # Raised explicitly: an ``assert`` statement is skipped under ``python -O``
+    if expected not in file_content:
+        raise AssertionError(f"`{expected}` not found in the job output")
diff --git a/requirements/ci.txt b/requirements/ci.txt
new file mode 100644
index 0000000000..9c23307187
--- /dev/null
+++ b/requirements/ci.txt
@@ -0,0 +1 @@
+pyfirecrest==3.0.0