Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions .github/workflows/balfrin-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#
# Copyright (c) 2025, ETH Zurich. All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#
name: Balfrin-CI

# Controls when the workflow will run
on:
  # Triggers the workflow on push or pull request events but only for the "main" branch
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "test_cluster"
  test_cluster:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    strategy:
      matrix:
        system_name: ["balfrin"]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v3

      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'

      - name: install python packages
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/ci.txt

      - name: Run testing script
        env:
          FIRECREST_CLIENT_ID: ${{ secrets.F7T_CLIENT_ID }}
          FIRECREST_CLIENT_SECRET: ${{ secrets.F7T_CLIENT_SECRET }}
          FIRECREST_URL: ${{ secrets.F7T_URL }}
          AUTH_TOKEN_URL: ${{ secrets.F7T_TOKEN_URL }}
          SYSTEM_WORKING_DIR: ${{ vars.F7T_SYSTEM_WORKING_DIR }}
          # Context values are handed to the script through the environment
          # instead of being expanded inside the shell source: a branch name
          # is chosen by the pull-request author, and expanding it inline
          # would let it inject shell commands into this step.
          EVENT_NAME: ${{ github.event_name }}
          HEAD_REF: ${{ github.head_ref }}
          REF_NAME: ${{ github.ref_name }}
          SYSTEM_NAME: ${{ matrix.system_name }}
          REPO_URL: ${{ github.server_url }}/${{ github.repository }}.git
        # NOTE(review): for pull requests coming from forks the head branch
        # does not exist in REPO_URL - confirm whether fork PRs are expected
        # to run this job.
        run: |
          echo "Event name: $EVENT_NAME"
          echo "Head ref: $HEAD_REF"
          echo "Ref name: $REF_NAME"
          # Pull requests test the source branch of the PR; every other
          # event tests the ref that triggered the workflow.
          if [ "$EVENT_NAME" = "pull_request" ]; then
            echo "Using branch: $HEAD_REF"
            BRANCH_NAME="$HEAD_REF"
          else
            echo "Using branch: $REF_NAME"
            BRANCH_NAME="$REF_NAME"
          fi
          python ci/ci_script.py \
            --system="$SYSTEM_NAME" \
            --branch="$BRANCH_NAME" \
            --repo="$REPO_URL" \
            --account=s83
154 changes: 154 additions & 0 deletions ci/ci_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#
# Copyright (c) 2025, ETH Zurich. All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#
import argparse
import os
import sys
import time

import firecrest as fc
from firecrest import FirecrestException

import utilities as util


# Slurm job states treated as terminal by the polling loop: once a job
# reports one of these, polling stops.
final_slurm_states = {
    # The only state that counts as success.
    "COMPLETED",
    # Every other way a job can end.
    "BOOT_FAIL",
    "CANCELLED",
    "DEADLINE",
    "FAILED",
    "NODE_FAIL",
    "OUT_OF_MEMORY",
    "PREEMPTED",
    "TIMEOUT",
}


def select_dict_by_name(name, list_of_dicts, select_key="name"):
    """Return the first dict whose ``select_key`` entry equals ``name``.

    Args:
        name: Value to look for.
        list_of_dicts: Iterable of dicts, searched in order.
        select_key: Key whose value is compared against ``name``.

    Returns:
        The first matching dict, or ``None`` when nothing matches.

    Raises:
        KeyError: If a dict examined before a match lacks ``select_key``.
    """
    candidates = (entry for entry in list_of_dicts if entry[select_key] == name)
    return next(candidates, None)


def check_mandatory_env_var(env_var):
    """Return the value of a required environment variable.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        The non-empty value of the variable.

    Raises:
        SystemExit: With exit code 1 when the variable is unset or empty.
            The name of the variable is reported on stderr first so that a
            failing CI run shows what is missing.
    """
    value = os.environ.get(env_var)
    if not value:
        print(
            f"Mandatory environment variable `{env_var}` is not set",
            file=sys.stderr,
        )
        # sys.exit rather than the `exit` helper, which is injected by the
        # `site` module and meant for interactive sessions.
        sys.exit(1)

    return value


def main():
    """Submit the CI batch job through FirecREST and report its outcome.

    Command-line options select the target system, the branch and the
    repository to test. Client credentials, the FirecREST and token URLs
    and the remote working directory are read from mandatory environment
    variables.

    The function submits the generated batch script, polls the job until
    it reaches one of ``final_slurm_states``, prints the tail of
    ``job.out`` and ``job.err`` from the working directory and finally
    validates the captured stdout with ``util.check_output``.

    Raises:
        SystemExit: With exit code 1 when a mandatory environment variable
            is missing, the system is unknown, its scheduler is not
            reported healthy, or the job ends in a state other than
            ``COMPLETED``.
        FirecrestException: For any polling error other than a 404.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--system",
        default=os.environ.get("FIRECREST_SYSTEM"),
        help="choose system to run",
    )
    parser.add_argument("--branch", default="main", help="branch to be tested")
    parser.add_argument(
        "--account", default="csstaff", help="account to submit the job with"
    )
    parser.add_argument("--repo", help="repository to be tested")

    args = parser.parse_args()
    system_name = args.system
    ref = args.branch
    print(f"Will try to run the ci in system {system_name} on branch {ref}")

    # Setup variables of the client
    client_id = check_mandatory_env_var("FIRECREST_CLIENT_ID")
    client_secret = check_mandatory_env_var("FIRECREST_CLIENT_SECRET")
    firecrest_url = check_mandatory_env_var("FIRECREST_URL")
    auth_token_url = check_mandatory_env_var("AUTH_TOKEN_URL")
    system_working_dir = check_mandatory_env_var("SYSTEM_WORKING_DIR")

    keycloak = fc.ClientCredentialsAuth(client_id, client_secret, auth_token_url)
    client = fc.v2.Firecrest(firecrest_url=firecrest_url, authorization=keycloak)

    all_systems = client.systems()
    system_names = [system["name"] for system in all_systems]
    print(f"Available systems: {', '.join(system_names)}")

    # NOTE(review): --account is parsed but deliberately not forwarded here
    # (the argument is commented out); confirm whether that is still wanted.
    script_content = util.create_batch_script(
        repo=args.repo,
        # num_nodes=2,
        # account=args.account,
        # custom_modules=["cray", "cray-python"],
        branch=ref,
    )

    system_state = select_dict_by_name(system_name, all_systems)
    if not system_state:
        print(f"System `{system_name}` is not available")
        sys.exit(1)

    print(f"System info: {system_state}")

    # scheduler information
    scheduler_health_info = select_dict_by_name(
        "scheduler", system_state["servicesHealth"], "serviceType"
    )

    # A system that reports no scheduler service at all is treated like an
    # unhealthy one instead of failing with a TypeError on `None`.
    if not scheduler_health_info or not scheduler_health_info["healthy"]:
        print(f"Scheduler of system `{system_name}` is not healthy")
        sys.exit(1)

    job = client.submit(
        system_name,
        working_dir=system_working_dir,
        script_str=script_content,
    )
    job_id = job["jobId"]
    print(f"Submitted job: {job_id}")

    while True:
        try:
            poll_result = client.job_info(system_name, jobid=job_id)
        except FirecrestException as e:
            # A 404 is taken to mean the job is not visible yet; retry
            # shortly. Anything else is a real error.
            if e.responses[-1].status_code == 404:
                print(f"No available information yet for job {job_id}")
                time.sleep(2)
                continue

            raise

        print(f"Job status: {poll_result}")
        state = poll_result[0]["status"]["state"]
        if state in final_slurm_states:
            print(f"Job is in final state: {state}")
            break

        print(f"Status of the job is {state}, will try again in 10 seconds")
        time.sleep(10)

    stdout_file_path = os.path.join(system_working_dir, "job.out")
    stderr_file_path = os.path.join(system_working_dir, "job.err")

    # Both logs are printed before the state is checked so that a failed
    # job still leaves its output in the CI log.
    print(f"\nSTDOUT in {stdout_file_path}")
    stdout_content = client.tail(
        system_name, path=stdout_file_path, num_lines=1000
    )["content"]
    print(stdout_content)

    print(f"\nSTDERR in {stderr_file_path}")
    stderr_content = client.tail(
        system_name, path=stderr_file_path, num_lines=1000
    )["content"]
    print(stderr_content)

    # Some sanity checks:
    if state != "COMPLETED":
        print(f"Job was not successful, status: {state}")
        sys.exit(1)

    util.check_output(stdout_content)


if __name__ == "__main__":
main()
66 changes: 66 additions & 0 deletions ci/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#
# Copyright (c) 2025, ETH Zurich. All rights reserved.
#
# Please, refer to the LICENSE file in the root directory.
# SPDX-License-Identifier: BSD-3-Clause
#


def create_batch_script(
    repo, num_nodes=1, account=None, custom_modules=None, branch="main", constraint=None
):
    """Build the Slurm batch script that clones the repository and runs the tests.

    Args:
        repo: URL of the git repository to clone.
        num_nodes: Value for ``#SBATCH --nodes``.
        account: Optional value for ``#SBATCH --account``; the directive is
            omitted when falsy.
        custom_modules: Optional iterable of extra module names, loaded
            with a single ``module load`` line.
        branch: Branch passed to ``git clone -b``.
        constraint: Optional value for ``#SBATCH --constraint``; the
            directive is omitted when falsy.

    Returns:
        The complete script as a string.
    """
    # Imported locally so the function is self-contained.
    import shlex

    # Branch and repository end up on a bash command line, and the branch
    # can come from a pull request, so both are shell-quoted. Values made
    # only of ordinary characters are left unchanged by shlex.quote.
    quoted_branch = shlex.quote(str(branch))
    quoted_repo = shlex.quote(str(repo))

    script = f"""#!/bin/bash -l
#SBATCH --job-name="ci_job-spack-c2sm"
#SBATCH --output=job.out
#SBATCH --error=job.err
#SBATCH --time=2:00:00
#SBATCH --partition=postproc
#SBATCH --nodes={num_nodes}
"""

    if constraint:
        script += f"#SBATCH --constraint={constraint}\n"

    if account:
        script += f"#SBATCH --account={account}\n"

    script += f"""

# Clone command will fail if the directory already exists
# Remove this first if you are using the same working directory
# every time
if [ -d "firecrest-ci" ]; then
rm -rf firecrest-ci
fi
git clone --depth 1 --shallow-submodules --recurse-submodules -b {quoted_branch} {quoted_repo} firecrest-ci
cd firecrest-ci

module use /mch-environment/v8/modules
module load python/3.11.7
"""

    if custom_modules:
        quoted_modules = " ".join(shlex.quote(str(m)) for m in custom_modules)
        script += f"module load {quoted_modules}\n"

    script += """
python -m venv testing-venv
source ./testing-venv/bin/activate
python -m pip install --upgrade pip
python -m pip install -r requirements/test.txt
deactivate

python --version

source ./setup-env.sh
spack spec gnuconfig

source ./setup-env.sh /mch-environment/v8
source ./testing-venv/bin/activate
srun pytest -v -n 64 test/common_system_test.py test/balfrin_system_test.py
"""

    return script


def check_output(file_content, marker="loops, best of"):
    """Verify that the captured job output contains the expected marker.

    Args:
        file_content: Text of the job's standard output.
        marker: Substring that must be present for the run to count as
            valid. Defaults to the marker this check has always used.

    Raises:
        AssertionError: If ``marker`` does not occur in ``file_content``.
            Raised explicitly rather than through an ``assert`` statement
            so the check still runs under ``python -O``.
    """
    # NOTE(review): the default marker looks like `timeit` output; confirm
    # it is what the submitted job is expected to print.
    if marker not in file_content:
        raise AssertionError(f"Expected `{marker}` in the job output")
1 change: 1 addition & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pyfirecrest==3.0.0
Loading