diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 7605ce8a..fa7241bf 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + name: Documentation on: @@ -8,7 +10,7 @@ on: workflow_dispatch: jobs: - build: + build-docs: runs-on: ubuntu-latest steps: - @@ -22,8 +24,7 @@ jobs: - name: Install package and documentation dependencies run: | - pip install . - pip install -r docs/requirements.txt + pip install ".[docs]" - name: Build documentation run: | diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..5f9ad47b --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Lint + +on: + push: + branches: [ develop, main ] + pull_request: + branches: [ develop, main ] + workflow_dispatch: + +jobs: + lint-check: + runs-on: ubuntu-latest + steps: + - + name: Checkout repository + uses: actions/checkout@v4 + - + name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + cache: 'pip' + - + name: Install ruff + run: pip install ruff==0.15.4 + - + name: Run ruff check + run: ruff check + - + name: Run ruff format check + run: ruff format --check --diff . diff --git a/.github/workflows/docker-slurm.yaml b/.github/workflows/test-suite.yaml similarity index 98% rename from .github/workflows/docker-slurm.yaml rename to .github/workflows/test-suite.yaml index aa906590..c7cb134b 100644 --- a/.github/workflows/docker-slurm.yaml +++ b/.github/workflows/test-suite.yaml @@ -1,4 +1,6 @@ -name: ChiltepinTests +# SPDX-License-Identifier: Apache-2.0 + +name: TestSuite env: LATEST_PYTHON_VERSION: '3.13' @@ -11,19 +13,6 @@ on: workflow_dispatch: jobs: - lint-check: - runs-on: ubuntu-latest - steps: - - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Format Check - run: | - pip install ruff - ruff check - ruff format --check --diff . 
- test-chiltepin-amd64: runs-on: ubuntu2204-16c-64g-600ssd timeout-minutes: 360 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 043f3e50..0fe1c6a9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 # Read the Docs configuration file for Sphinx projects # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details @@ -32,6 +33,7 @@ sphinx: # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - - requirements: docs/requirements.txt - method: pip path: . + extra_requirements: + - docs diff --git a/LICENSE b/LICENSE index 66e0f8a1..3a0353a7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,203 @@ -Software code created by U.S. Government employees is not subject to copyright -in the United States (17 U.S.C. §105). The United States/Department of Commerce -reserve all rights to seek and obtain copyright protection in countries other -than the United States for Software authored in its entirety by the Department of -Commerce. To this end, the Department of Commerce hereby grants to Recipient a -royalty-free, nonexclusive license to use, copy, and create derivative works of -the Software outside of the United States. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2024-2026 The Regents of the University of Colorado and + contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 1c4c4e6f..197c366d 100644 --- a/README.md +++ b/README.md @@ -1,113 +1,51 @@ -[![ExascaleSandboxTests](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docker-slurm.yaml/badge.svg)](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docker-slurm.yaml) +[![ExascaleSandboxTests](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/test-suite.yaml/badge.svg)](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/test-suite.yaml) [![Documentation](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docs.yaml/badge.svg)](https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docs.yaml) -``` -This repository is a scientific product and is not official communication -of the National Oceanic and Atmospheric Administration, or the United States -Department of Commerce. All NOAA GitHub project code is provided on an ‘as -is’ basis and the user assumes responsibility for its use. Any claims against -the Department of Commerce or Department of Commerce bureaus stemming from -the use of this GitHub project will be governed by all applicable Federal -law. 
Any reference to specific commercial products, processes, or service -by service mark, trademark, manufacturer, or otherwise, does not constitute -or imply their endorsement, recommendation or favoring by the Department of -Commerce. The Department of Commerce seal and logo, or the seal and logo of -a DOC bureau, shall not be used in any manner to imply endorsement of any -commercial product or activity by DOC or the United States Government. -``` +# Chiltepin -# Overview +## Overview This repository is a collection of tools and demonstrations used to explore and test various technologies for implementing exascale scientific workflows. This collection of resources is not intended for production use, and is for research purposes only. -# Installation - -This software can be installed on Linux systems. MacOS is not currently -supported. It can be used, however, on Macs in a container. See below for -instructions for building and using the Docker container. - -The recommended method for installation is to use a Python venv. - -``` -python -m venv .chiltepin -source .chiltepin/bin/activate -pip install -e .[test] -``` - -Alternatively, a conda environment (anaconda3, miniconda3, miniforge, etc.) -can be used. - -``` -conda create -n "chiltepin" python=3.10 -source activate chiltepin -pip install -e .[test] -``` - -NOTE: The `[test]` ensures that dependencies required for running the tests are installed. +Chiltepin provides Python decorators and utilities for building scientific workflows +that can execute on distributed computing resources using [Parsl](https://parsl-project.org/) +and [Globus](https://www.globus.org/) services. -Once installed, Chiltepin can be used simply by activating the environment using -the command appropriate for your environment type (venv, conda, etc). 
+## Documentation -# Running the test suite +**πŸ“š Full documentation is available at [Read the Docs](https://exascaleworkflowsandbox.readthedocs.io/)** -The test suite is run with `pytest` and requires an editable installation of the Chiltepin -repository (achieved using the `pip install -e .` installation step from above) +Key documentation sections: +- [Installation Guide](https://exascaleworkflowsandbox.readthedocs.io/en/latest/installation.html) - Installing Chiltepin on Linux and via Docker on macOS and Windows +- [Quick Start](https://exascaleworkflowsandbox.readthedocs.io/en/latest/quickstart.html) - Your first Chiltepin workflow +- [Tasks](https://exascaleworkflowsandbox.readthedocs.io/en/latest/tasks.html) - Python, Bash, and Join task decorators +- [Configuration](https://exascaleworkflowsandbox.readthedocs.io/en/latest/configuration.html) - Configuring compute resources +- [Endpoints](https://exascaleworkflowsandbox.readthedocs.io/en/latest/endpoints.html) - Managing Globus Compute endpoints +- [Data Transfer](https://exascaleworkflowsandbox.readthedocs.io/en/latest/data.html) - Using Globus for data movement +- [Testing Guide](https://exascaleworkflowsandbox.readthedocs.io/en/latest/testing.html) - Running the test suite -An additional step is required for successful completion of the Globus Compute tests. These -tests require users to authenticate to globus before running the pytest command. This is done -with the `globus-compute-endpoint login` command. +## Quick Start -``` -globus-compute-endpoint login -pytest --assert=plain --config=tests/configs/.yaml -``` +Install Chiltepin in a Python virtual environment: -Where `` is the specific platform where you are running the tests: - -1. `docker` # Platform used for the container -2. `hercules` -3. `hera` -4. 
`ursa` - -For more detailed information during testing -``` -pytest -s -vvv --assert=plain --config=tests/configs/.yaml -``` - -# Building and running the Chiltepin container - -Chiltepin provides a Docker container environment for building and running Parsl and Chiltepin -applications. It makes use of docker compose to build a multi-node Slurm cluster for use as a -backend for running the applications. This repository is mounted from the host into the container's -chiltepin directory. - -To build the container: - -``` -cd docker -docker compose -f docker-compose.yml up -d +```bash +python -m venv .chiltepin +source .chiltepin/bin/activate +pip install -e . ``` -To use the container after it is built and up, log in with a bash shell: +For detailed installation instructions including conda, Docker, and platform-specific guidance, +see the [Installation Guide](https://exascaleworkflowsandbox.readthedocs.io/en/latest/installation.html). -``` -docker exec -it frontend bash -l -``` +## Contributing -Once in the container, you can install Chiltepin in editable mode (using the pip from the -container environment), and run the tests +Contributions are welcome! For information on running tests and contributing to development, +see the [Testing Guide](https://exascaleworkflowsandbox.readthedocs.io/en/latest/testing.html). -``` -cd chiltepin -pip install -e .[test] -pytest --assert=plain --config=tests/configs/docker.yaml -``` +## License -NOTE: the `[test]` ensures that dependencies required for running the tests are installed. +See [LICENSE](LICENSE) for details. -NOTE: Depending on how many cores your machine has and how many you've allocated to Docker, -you may need to modify the `cores per node` setting in the configuration yaml file to match -your machine's specifications to get all tests to pass. 
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 33a91b9b..a0fbd7ed 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + services: slurmfrontend: image: ghcr.io/noaa-gsl/dockerspackstackslurmcluster/frontend:latest diff --git a/docs/conf.py b/docs/conf.py index 825efbca..756f697b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 # Configuration file for the Sphinx documentation builder. from importlib.metadata import PackageNotFoundError @@ -6,7 +7,7 @@ # -- Project information project = "Chiltepin" -copyright = "2024-2026, Christopher W Harrop" +copyright = "2024-2026, The Regents of the University of Colorado and contributors" author = "Christopher W Harrop" # Get version from the installed package diff --git a/docs/configuration.rst b/docs/configuration.rst new file mode 100644 index 00000000..302dcd79 --- /dev/null +++ b/docs/configuration.rst @@ -0,0 +1,452 @@ +Configuration +============= + +Chiltepin uses YAML configuration files to specify a collection of resources for use +during workflow execution. Each resource in the configuration describes a pool of computational +resources to which tasks can be submitted for execution. + +Overview +-------- + +Resources can be: + +- **Local**: Your laptop or workstation +- **HPC systems**: Accessed through job schedulers like Slurm or PBS Pro +- **Remote systems**: Accessed through Globus Compute endpoints + +The configuration file defines named resources, where each resource represents a pool +of nodes and/or cores on which tasks can be executed. + +Basic Structure +--------------- + +A Chiltepin configuration file contains one or more named resource definitions: + +.. code-block:: yaml + + resource-name-1: + provider: "slurm" + partition: "compute" + cores_per_node: 4 + nodes_per_block: 2 + # ... 
additional options + + resource-name-2: + endpoint: "uuid-of-globus-compute-endpoint" + mpi: True + max_mpi_apps: 4 + # ... additional options + +Understanding Resource Configuration +------------------------------------ + +Three key options determine how resources are accessed and used: + +Provider: How Resources Are Acquired +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``provider`` option specifies how computational resources for running your workflow are +obtained: + +- ``"localhost"`` - Use CPU resources on the current machine +- ``"slurm"`` - Obtain a pool of resources via a Slurm scheduler pilot job +- ``"pbspro"`` - Obtain a pool of resources via a PBS Pro scheduler pilot job + +When using HPC providers (Slurm or PBS Pro), you can specify scheduler-specific +options like partition, queue, account, and walltime. + +Endpoint: Remote Resource Access +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``endpoint`` option specifies a Globus Compute endpoint UUID for accessing +remote resources: + +.. code-block:: yaml + + remote-hpc: + endpoint: "12345678-1234-1234-1234-123456789abc" + mpi: True + provider: "slurm" + partition: "gpu" + +When an endpoint is specified, tasks are sent to the remote system via Globus Compute. +All other configuration options (provider, mpi, etc.) are passed to the endpoint's +configuration template (created automatically by Chiltepin when an endpoint is configured) +to describe the resource pool on the remote system. + +MPI: Support for Parallel Applications +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``mpi`` option indicates whether the resource pool supports MPI (Message Passing Interface) +applications: + +.. code-block:: yaml + + mpi-resource: + mpi: True + max_mpi_apps: 4 + provider: "slurm" + nodes_per_block: 8 + +When ``mpi: True``, the resource is configured to run parallel MPI applications. +You can control how many concurrent MPI applications can run with ``max_mpi_apps``. 
+ +Resource Types +-------------- + +Based on the configuration options, Chiltepin automatically determines the resource type: + +**Remote Resources** + Use Globus Compute to access remote systems. Specified by providing an ``endpoint`` UUID. + +**MPI Resources** + Run parallel MPI applications on HPC systems (local or remote). Specified by setting + ``mpi: True``. + +**High-Throughput Resources** + Run many independent tasks concurrently. This is the default when ``mpi`` is not specified + (or is set to ``False``), whether the resources are local or remote. + +Configuration Options +--------------------- + +Common Options +^^^^^^^^^^^^^^ + +These options apply to all resource types: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Option + - Type + - Default + - Description + * - ``mpi`` + - boolean + - ``False`` + - Enable MPI support for parallel applications + * - ``provider`` + - string + - ``"localhost"`` + - How to acquire resources: ``"localhost"``, ``"slurm"``, or ``"pbspro"`` + * - ``init_blocks`` + - integer + - ``0`` + - Number of resource blocks to provision at startup + * - ``min_blocks`` + - integer + - ``0`` + - Minimum number of resource blocks to maintain + * - ``max_blocks`` + - integer + - ``1`` + - Maximum number of resource blocks allowed + * - ``environment`` + - list + - ``[]`` + - Shell commands to run before executing tasks (e.g., module loads) + +MPI-Specific Options +^^^^^^^^^^^^^^^^^^^^ + +When ``mpi: True``: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Option + - Type + - Default + - Description + * - ``max_mpi_apps`` + - integer + - ``1`` + - Maximum number of concurrent MPI applications + * - ``mpi_launcher`` + - string + - ``"srun"`` (Slurm) or ``"mpiexec"`` + - MPI launcher command to use + +HPC Provider Options +^^^^^^^^^^^^^^^^^^^^ + +When ``provider`` is ``"slurm"`` or ``"pbspro"``: + +.. 
list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Option + - Type + - Default + - Description + * - ``cores_per_node`` + - integer + - ``1`` + - Number of cores per compute node (ignored for MPI resources) + * - ``nodes_per_block`` + - integer + - ``1`` + - Number of nodes per block/job + * - ``exclusive`` + - boolean + - ``True`` + - Request exclusive node allocation (Slurm only) + * - ``partition`` + - string + - None + - Scheduler partition to use (Slurm only) + * - ``queue`` + - string + - None + - QOS (Slurm) or queue name (PBS Pro) + * - ``account`` + - string + - None + - Account/project to charge for resources + * - ``walltime`` + - string + - ``"00:10:00"`` + - Maximum walltime for jobs (HH:MM:SS) + +High-Throughput Resource Options +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For non-MPI resources: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Option + - Type + - Default + - Description + * - ``cores_per_worker`` + - integer + - ``1`` + - Number of cores per worker process + * - ``max_workers_per_node`` + - integer + - Auto + - Maximum workers per node (auto-calculated if not specified) + +Remote Resource Options +^^^^^^^^^^^^^^^^^^^^^^^ + +When ``endpoint`` is specified: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Option + - Type + - Default + - Description + * - ``endpoint`` + - string + - **Required** + - UUID of the Globus Compute endpoint + +.. note:: + All other options (provider, mpi, cores_per_node, etc.) are passed to the endpoint's + configuration template that Chiltepin creates automatically when endpoints are configured. + +Example Configurations +---------------------- + +Local Execution +^^^^^^^^^^^^^^^ + +Simple local resource for testing: + +.. code-block:: yaml + + local: + provider: "localhost" + init_blocks: 1 + max_blocks: 1 + +Slurm HPC System +^^^^^^^^^^^^^^^^ + +Single-node computation: + +.. 
code-block:: yaml + + compute: + provider: "slurm" + cores_per_node: 128 + nodes_per_block: 1 + partition: "compute" + account: "myproject" + walltime: "01:00:00" + environment: + - "module load python/3.11" + - "module load gcc/11.2" + +Multi-node MPI: + +.. code-block:: yaml + + mpi: + mpi: True + max_mpi_apps: 2 + mpi_launcher: "srun" + provider: "slurm" + cores_per_node: 128 + nodes_per_block: 4 + exclusive: True + partition: "compute" + account: "myproject" + walltime: "02:00:00" + environment: + - "module load openmpi/4.1" + - "export MPIF90=$MPIF90" + +PBS Pro System +^^^^^^^^^^^^^^ + +.. code-block:: yaml + + pbs-compute: + provider: "pbspro" + cores_per_node: 36 + nodes_per_block: 2 + queue: "normal" + account: "MYACCT123" + walltime: "00:30:00" + environment: + - "module load intel/2021" + +Remote Resource via Globus Compute +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + remote-mpi: + endpoint: "12345678-1234-1234-1234-123456789abc" + mpi: True + max_mpi_apps: 4 + provider: "slurm" + cores_per_node: 128 + nodes_per_block: 8 + partition: "gpu" + account: "myproject" + walltime: "04:00:00" + environment: + - "module load cuda/11.8" + - "module load openmpi/4.1-cuda" + +Multiple Resources +^^^^^^^^^^^^^^^^^^ + +Combine multiple resource types in one file: + +.. code-block:: yaml + + # Local service tasks + service: + provider: "localhost" + max_blocks: 1 + max_workers_per_node: 3 + + # Local HPC compute tasks + compute: + provider: "slurm" + cores_per_node: 64 + nodes_per_block: 10 + partition: "standard" + account: "myproject" + walltime: "01:00:00" + + # Remote MPI tasks via Globus Compute + remote-mpi: + endpoint: "12345678-1234-1234-1234-123456789abc" + mpi: True + max_mpi_apps: 2 + provider: "slurm" + partition: "standard" + account: "myproject" + nodes_per_block: 16 + +Environment Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``environment`` option accepts a list of shell commands that are executed before running +tasks. 
This is commonly used for: + +- Loading environment modules +- Setting environment variables +- Activating virtual environments +- Exporting paths + +.. code-block:: yaml + + resource-name: + environment: + - "module purge" + - "module load python/3.11 gcc/11.2 openmpi/4.1" + - "export MY_VAR=value" + - "source /path/to/venv/bin/activate" + +.. tip:: + Use YAML anchors to share common environment setup across multiple resources: + + .. code-block:: yaml + + common_env: &common_env + - "module load python/3.11" + - "export PYTHONPATH=/my/path:$PYTHONPATH" + + resource1: + environment: *common_env + + resource2: + environment: *common_env + +Loading Configurations +---------------------- + +Parse and Load +^^^^^^^^^^^^^^ + +.. code-block:: python + + import chiltepin.configure + import parsl + + # Parse YAML configuration file + config_dict = chiltepin.configure.parse_file("my_config.yaml") + + # Create Parsl configuration + parsl_config = chiltepin.configure.load( + config_dict, + include=["compute", "mpi"], # Only load specific resources + run_dir="./runinfo" # Directory for Parsl runtime files + ) + + # Initialize Parsl with configuration + parsl.load(parsl_config) + +The ``include`` parameter lets you selectively load only specific resources from your +configuration file. If omitted, all resources are loaded. + +Configuration Best Practices +----------------------------- + +1. **Start Small**: Begin with short walltimes and small resource requests while testing +2. **Use Anchors**: Share common configuration blocks (like ``environment``) using YAML anchors +3. **Resource Limits**: Set appropriate ``min_blocks`` and ``max_blocks`` to control scaling +4. 
**Environment Modules**: Always include necessary module loads in the ``environment`` section + +See Also +-------- + +- :doc:`quickstart` - Complete workflow example +- :doc:`endpoints` - Managing Globus Compute endpoints +- :doc:`api` - Python API reference diff --git a/docs/container.rst b/docs/container.rst index 53319c56..5d514008 100644 --- a/docs/container.rst +++ b/docs/container.rst @@ -1,8 +1,8 @@ Docker Container ================ -Chiltepin provides a Docker container environment for building and running Parsl -and Chiltepin applications. The container uses Docker Compose to build a multi-node +Chiltepin provides a Docker container environment for building and running +Chiltepin applications. The container uses Docker Compose to build a multi-node Slurm cluster that serves as a backend for running workflow applications. This is particularly useful for: @@ -88,7 +88,8 @@ The Docker environment consists of: * **Frontend node**: Where you interact with the system and submit jobs * **Compute nodes**: Multiple Slurm compute nodes for running jobs -* **Shared filesystem**: The repository is mounted from the host +* **Master node**: The Slurm controller that manages job scheduling and resource allocation +* **Shared volume**: A shared directory for data and code accessible by all nodes This simulates a real HPC cluster environment with job scheduling and multi-node execution. @@ -116,8 +117,8 @@ Troubleshooting Check that Docker has sufficient resources allocated (CPU, memory, disk space). **Tests fail with resource errors:** - Reduce the ``cores_per_node`` or ``nodes_per_block`` values in the Docker - configuration file. + Reduce the ``cores_per_node`` values in the Docker configuration file. **Cannot access mounted repository:** - Ensure Docker has permission to access the repository directory on your host system. + Ensure Docker has permission to access the repository directory on your host system. 
If not, + clone the repository directly inside the container and install Chiltepin there. diff --git a/docs/data.rst b/docs/data.rst new file mode 100644 index 00000000..60c7e33e --- /dev/null +++ b/docs/data.rst @@ -0,0 +1,534 @@ +Data Transfer and Management +============================ + +Chiltepin provides specialized tasks for transferring and deleting data between Globus +Transfer endpoints. These tasks integrate seamlessly with Chiltepin workflows, allowing +you to stage data, process it, and clean up afterward. + +.. important:: + **Transfer Endpoints vs Compute Endpoints**: Globus has two types of endpoints: + + - **Transfer Endpoints**: Used for moving and managing files (documented here) + - **Compute Endpoints**: Used for executing tasks (see :doc:`endpoints`) + + These are configured and managed separately through the Globus service. + +Overview +-------- + +The data module provides two Chiltepin tasks for workflow data management: + +- **transfer_task**: Transfer files/directories between Globus Transfer endpoints +- **delete_task**: Delete files/directories from Globus Transfer endpoints + +These are standard Chiltepin tasks that return futures. You can create dependencies by: + +- Using the ``inputs`` parameter to pass a list of futures (non-blocking) +- Passing futures as function arguments (when the task signature supports it) +- Calling ``.result()`` to wait synchronously (blocking) + +Data Transfer Task +------------------ + +The ``transfer_task`` function transfers data between two Globus Transfer endpoints. + +Basic Usage +^^^^^^^^^^^ + +.. 
code-block:: python + + from chiltepin.data import transfer_task + + # Transfer a single file + transfer_future = transfer_task( + src_ep="my-laptop", + dst_ep="hpc-scratch", + src_path="/Users/me/data/input.dat", + dst_path="/scratch/project/input.dat", + executor="local" + ) + + # Wait for transfer to complete + success = transfer_future.result() + if success: + print("Transfer completed successfully") + +Parameters +^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Parameter + - Type + - Default + - Description + * - ``src_ep`` + - string + - **Required** + - Source Transfer endpoint name or UUID + * - ``dst_ep`` + - string + - **Required** + - Destination Transfer endpoint name or UUID + * - ``src_path`` + - string + - **Required** + - Path to file/directory on source Transfer endpoint + * - ``dst_path`` + - string + - **Required** + - Path to file/directory on destination Transfer endpoint + * - ``timeout`` + - integer + - ``3600`` + - Seconds to wait for transfer completion + * - ``polling_interval`` + - integer + - ``30`` + - Seconds between status checks + * - ``client`` + - TransferClient + - ``None`` + - Globus TransferClient (auto-created if None) + * - ``recursive`` + - boolean + - ``False`` + - Transfer directories recursively + * - ``executor`` + - string + - **Required** + - Resource name for running the transfer task (must have internet access) + +Recursive Transfer +^^^^^^^^^^^^^^^^^^ + +Transfer entire directories recursively: + +.. code-block:: python + + # Transfer a directory and all its contents + transfer_future = transfer_task( + src_ep="my-laptop", + dst_ep="hpc-scratch", + src_path="/Users/me/project/data/", + dst_path="/scratch/project/data/", + recursive=True, + executor="local" + ) + +Endpoint Names vs UUIDs +^^^^^^^^^^^^^^^^^^^^^^^ + +You can specify endpoints by their display name or UUID: + +.. 
code-block:: python + + # Using display names + transfer_task( + src_ep="My Laptop", + dst_ep="HPC Scratch Space", + ... + ) + + # Using UUIDs + transfer_task( + src_ep="12345678-1234-1234-1234-123456789abc", + dst_ep="87654321-4321-4321-4321-cba987654321", + ... + ) + +.. tip:: + UUIDs are more reliable than display names, which can change. Find your endpoint + UUIDs at `app.globus.org `_. + +Data Deletion Task +------------------ + +The ``delete_task`` function removes files or directories from a Globus Transfer endpoint. + +Basic Usage +^^^^^^^^^^^ + +.. code-block:: python + + from chiltepin.data import delete_task + + # Delete a single file + delete_future = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/project/temp.dat", + executor="local" + ) + + # Wait for deletion to complete + success = delete_future.result() + if success: + print("File deleted successfully") + +Parameters +^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 50 + + * - Parameter + - Type + - Default + - Description + * - ``src_ep`` + - string + - **Required** + - Transfer endpoint name or UUID where data will be deleted + * - ``src_path`` + - string + - **Required** + - Path to file/directory to delete + * - ``timeout`` + - integer + - ``3600`` + - Seconds to wait for deletion completion + * - ``polling_interval`` + - integer + - ``30`` + - Seconds between status checks + * - ``client`` + - TransferClient + - ``None`` + - Globus TransferClient (auto-created if None) + * - ``recursive`` + - boolean + - ``False`` + - Delete directories recursively + * - ``executor`` + - string + - **Required** + - Resource name for running the deletion task (must have internet access) + +Recursive Deletion +^^^^^^^^^^^^^^^^^^ + +Delete entire directories: + +.. code-block:: python + + # Delete a directory and all its contents + delete_future = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/project/temp_data/", + recursive=True, + executor="local" + ) + +.. 
warning:: + Recursive deletion is permanent and cannot be undone. Use with caution. + +Workflow Integration +-------------------- + +Transfer and deletion tasks integrate seamlessly with Chiltepin workflows using the +``inputs`` parameter for non-blocking dependencies. + +Stage, Process, Cleanup Pattern +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A common pattern is to stage data, process it, then clean up: + +.. code-block:: python + + import parsl + import chiltepin.configure + from chiltepin.tasks import python_task + from chiltepin.data import transfer_task, delete_task + + @python_task + def analyze_data(input_path): + # Process the data file + import pandas as pd + df = pd.read_csv(input_path) + result = df.mean().to_dict() + return result + + # Load configuration and start Parsl + config_dict = chiltepin.configure.parse_file("config.yaml") + parsl_config = chiltepin.configure.load(config_dict) + + with parsl.load(parsl_config): + # Stage data to compute resource + stage_in = transfer_task( + src_ep="my-laptop", + dst_ep="hpc-scratch", + src_path="/Users/me/data/dataset.csv", + dst_path="/scratch/project/dataset.csv", + executor="local" + ) + + # Process the staged data (waits for transfer via inputs) + analysis = analyze_data( + "/scratch/project/dataset.csv", + executor="compute", + inputs=[stage_in] # Non-blocking dependency + ) + + # Clean up staged data (waits for processing via inputs) + cleanup = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/project/dataset.csv", + executor="local", + inputs=[analysis] # Non-blocking dependency + ) + + # Get results (blocks until analysis completes) + results = analysis.result() + print(f"Analysis results: {results}") + + # Ensure cleanup completes before exiting + cleanup.result() + +Multiple File Transfers +^^^^^^^^^^^^^^^^^^^^^^^^ + +Transfer multiple files in parallel: + +.. 
code-block:: python
+
+    from chiltepin.data import transfer_task
+
+    # Transfer multiple input files in parallel
+    files = ["sim1.dat", "sim2.dat", "sim3.dat"]
+
+    transfers = []
+    for filename in files:
+        future = transfer_task(
+            src_ep="my-laptop",
+            dst_ep="hpc-scratch",
+            src_path=f"/Users/me/data/{filename}",
+            dst_path=f"/scratch/project/{filename}",
+            executor="local"
+        )
+        transfers.append(future)
+
+    # Wait for all transfers to complete
+    for t in transfers:
+        assert t.result(), "Transfer failed"
+
+Waiting for Multiple Tasks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To run a transfer or deletion after multiple tasks complete, pass them via the
+``inputs`` parameter:
+
+.. code-block:: python
+
+    from chiltepin.data import transfer_task, delete_task
+    from chiltepin.tasks import python_task
+
+    @python_task
+    def generate_config():
+        # Generate config file
+        with open("/scratch/config.json", "w") as f:
+            f.write('{"param": 1}')
+        return True
+
+    @python_task
+    def generate_input():
+        # Generate input file
+        with open("/scratch/input.dat", "w") as f:
+            f.write("data")
+        return True
+
+    # Generate files in parallel
+    config_ready = generate_config(executor="compute")
+    input_ready = generate_input(executor="compute")
+
+    # Wait for both files before transferring (non-blocking dependency)
+    transfer = transfer_task(
+        src_ep="hpc-scratch",
+        dst_ep="my-laptop",
+        src_path="/scratch/",
+        dst_path="/results/",
+        recursive=True,
+        executor="local",
+        inputs=[config_ready, input_ready]  # Multiple dependencies
+    )
+
+    transfer.result()  # Wait for transfer
+
+Data Pipeline with Transfers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Build complete data pipelines using the ``inputs`` parameter for non-blocking
+dependencies:
+
+..
code-block:: python + + @python_task + def preprocess(input_path, output_path): + # Preprocessing step + import pandas as pd + df = pd.read_csv(input_path) + df_clean = df.dropna() + df_clean.to_csv(output_path, index=False) + return output_path + + @python_task + def analyze(clean_data_path): + # Analysis step + import pandas as pd + df = pd.read_csv(clean_data_path) + return df.describe().to_dict() + + # Stage raw data + stage_raw = transfer_task( + src_ep="my-laptop", + dst_ep="hpc-scratch", + src_path="/data/raw.csv", + dst_path="/scratch/raw.csv", + executor="local" + ) + + # Preprocess (waits for transfer via inputs) + preprocess_future = preprocess( + "/scratch/raw.csv", + "/scratch/clean.csv", + executor="compute", + inputs=[stage_raw] # Wait for transfer non-blocking + ) + + # Analyze (waits for preprocess completing and passing output path) + analysis_future = analyze( + preprocess_future, # Parsl waits for this future and passes the output path + executor="compute" + ) + + # Stage results back (waits for analysis via inputs) + stage_out = transfer_task( + src_ep="hpc-scratch", + dst_ep="my-laptop", + src_path="/scratch/clean.csv", + dst_path="/data/cleaned_output.csv", + executor="local", + inputs=[analysis_future] # Wait for analysis non-blocking + ) + + # Cleanup remote files (waits for stage_out via inputs) + cleanup = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/", + recursive=True, + executor="local", + inputs=[stage_out] # Wait for stage out non-blocking + ) + + # Get results when needed + results = analysis_future.result() + print(f"Analysis results: {results}") + + # Ensure cleanup completes before exiting + cleanup.result() + +Authentication +-------------- + +Data transfer tasks require Globus authentication. Use the Chiltepin login command: + +.. code-block:: bash + + $ chiltepin login + +This authenticates you with Globus and grants the necessary permissions for data +transfers. 
The authentication persists across workflow runs until you log out. + +.. note:: + You only need to authenticate once. The credentials are cached and reused for + subsequent transfers. + +Setting Up Data Endpoints +-------------------------- + +To use data transfer tasks, you need access to Globus Transfer endpoints: + +1. **Personal Endpoints**: Install Globus Connect Personal on your laptop/workstation + (`globus.org/globus-connect-personal `_) + +2. **Institutional Endpoints**: Many HPC centers provide pre-configured Globus endpoints. + Check with your institution's documentation. + +3. **Guest Collections**: Create shareable collections for specific directories + +Visit the `Globus File Manager `_ to view and +manage your endpoints. + +Best Practices +-------------- + +1. **Use Descriptive Endpoint Names**: Clear names make workflows easier to understand + and maintain. + +2. **Check Transfer Success**: Always check the result of transfer/delete tasks: + + .. code-block:: python + + success = transfer_future.result() + assert success, "Transfer failed" + +3. **Handle Permissions**: Ensure you have read permissions on source endpoints and + write permissions on destination endpoints. + +4. **Set Appropriate Timeouts**: Large transfers may need longer timeouts. The default + is 1 hour (3600 seconds). + +5. **Create Dependencies Properly**: Use the ``inputs`` parameter to create non-blocking + dependencies between tasks. Reserve ``.result()`` for when you actually need the data + or must wait synchronously. + +6. **Cleanup Staged Data**: Always delete temporary staged data to avoid filling up + scratch space. + +7. **Test Endpoints First**: Verify endpoints are set up correctly by doing a manual + transfer through the Globus web interface before automating. + +8. **Use Absolute Paths**: Always use absolute paths for both source and destination + to avoid ambiguity. 
+ +Troubleshooting +--------------- + +Transfer Not Starting +^^^^^^^^^^^^^^^^^^^^^ + +- Verify you've authenticated: ``chiltepin login`` +- Check endpoint names/UUIDs are correct +- Ensure both endpoints are activated (visit Globus File Manager) + +Permission Denied +^^^^^^^^^^^^^^^^^ + +- Verify you have read permissions on the source endpoint +- Verify you have write permissions on the destination endpoint +- Some endpoints require explicit activation in the Globus web interface + +Transfer Timing Out +^^^^^^^^^^^^^^^^^^^ + +- Increase the ``timeout`` parameter for large transfers +- Check network connectivity between endpoints +- Verify endpoints are online and not in maintenance mode + +Endpoint Not Found +^^^^^^^^^^^^^^^^^^ + +- Check endpoint name spelling (case-sensitive) +- Try using the endpoint UUID instead of display name +- Verify the endpoint is visible in your Globus File Manager + +See Also +-------- + +- :doc:`tasks` - General task documentation +- :doc:`endpoints` - Globus Compute endpoints for task execution +- :doc:`api` - Full API reference for the data module +- `Globus Documentation `_ - Official Globus guides + diff --git a/docs/endpoints.rst b/docs/endpoints.rst new file mode 100644 index 00000000..a8e11c0e --- /dev/null +++ b/docs/endpoints.rst @@ -0,0 +1,404 @@ +Endpoint Management +=================== + +Chiltepin provides command-line tools for managing Globus Compute endpoint lifecycles. These tools are light +wrappers around the existing Globus Compute CLI, adding convenience features that make it easier to manage +endpoints for use with Chiltepin. This page describes how to configure, start, stop, and manage endpoints. + +Overview +-------- + +Globus Compute endpoints enable you to run tasks on remote compute resources. 
Chiltepin's CLI simplifies +endpoint management by: + +- Managing authentication flows for all required Globus services at the same time +- Automatically configuring endpoints with appropriate templates matching Chiltepin's configuration options +- Automatically starting endpoints in the background + +The ``chiltepin`` Command +------------------------- + +All endpoint management is done through the ``chiltepin`` command-line interface. + +.. code-block:: bash + + $ chiltepin --help + usage: chiltepin [-h] {login,logout,endpoint} ... + + options: + -h, --help show this help message and exit + + chiltepin commands: + {login,logout,endpoint} + Chiltepin commands + login login to the Chiltepin App + logout logout of the Chiltepin App + endpoint endpoint commands + +Authentication +-------------- + +Before using Globus Compute endpoints, you must authenticate with Globus services. + +Login +^^^^^ + +.. code-block:: bash + + $ chiltepin login + +This will: + +1. Open a browser window (or present a URL) for Globus authentication +2. Save authentication tokens for future use + +.. note:: + Login credentials are cached, so you typically only need to login once per system. + +Logout +^^^^^^ + +.. code-block:: bash + + $ chiltepin logout + +This revokes all authentication tokens and clears cached credentials. + +Endpoint Commands +----------------- + +All endpoint operations use the ``chiltepin endpoint`` subcommand. + +.. code-block:: bash + + $ chiltepin endpoint --help + usage: chiltepin endpoint [-h] [-c CONFIG_DIR] + {configure,list,start,stop,delete} ... + +Configure an Endpoint +^^^^^^^^^^^^^^^^^^^^^ + +Create and configure a new endpoint: + +.. code-block:: bash + + $ chiltepin endpoint configure my-endpoint + +This will: + +1. Create a new endpoint configuration directory (``~/.globus_compute/my-endpoint/``) +2. Generate a default endpoint configuration with a template compatible with Chiltepin's configuration options +3. Set the endpoint display name +4. 
Enable debug logging +5. Configure the system PATH required for the endpoint environment + +**What Gets Created** + +After configuration, you'll have: + +.. code-block:: text + + ~/.globus_compute/my-endpoint/ + β”œβ”€β”€ config.yaml # Main endpoint configuration + β”œβ”€β”€ user_config_template.yaml.j2 # Jinja2 template for user configs + └── user_environment.yaml # PATH configuration for the endpoint process + +The ``config.yaml`` includes: + +- Display name matching your endpoint name +- Debug mode enabled + +The ``user_config_template.yaml.j2`` includes: + +- User endpoint configuration template with Jinja2 variables for Chiltepin configuration options + +The ``user_environment.yaml`` includes: + +- System PATH settings required for the endpoint process to find necessary executables + +**Custom Configuration Directory** + +By default, endpoints are stored in ``~/.globus_compute/``. You can specify a custom location: + +.. code-block:: bash + + $ chiltepin endpoint -c /path/to/config configure my-endpoint + +List Endpoints +^^^^^^^^^^^^^^ + +View all configured endpoints: + +.. code-block:: bash + + $ chiltepin endpoint list + +With custom configuration directory: + +.. code-block:: bash + + $ chiltepin endpoint -c /path/to/config list + +Output format: + +.. code-block:: text + + endpoint-name endpoint-uuid status + my-endpoint 12345678-1234-1234-1234-123456789abc Running + test-endpoint 87654321-4321-4321-4321-cba987654321 Stopped + +Start an Endpoint +^^^^^^^^^^^^^^^^^ + +Start a configured endpoint: + +.. code-block:: bash + + $ chiltepin endpoint start my-endpoint + +With custom configuration directory: + +.. code-block:: bash + + $ chiltepin endpoint -c /path/to/config start my-endpoint + +The endpoint will: + +1. Register with Globus Compute services (first time only) +2. Start accepting tasks +3. Run in the background +4. 
Automatically scale resources based on task demand + +**What Happens at Startup** + +- Endpoint UUID is registered with Globus Compute (if first start) +- Background daemon process is started +- Endpoint begins polling for tasks +- User endpoint configurations are validated + +.. tip:: + After starting an endpoint for the first time, note its UUID from ``chiltepin endpoint list``. You'll need this UUID to reference the endpoint in your Chiltepin configuration files. + +Stop an Endpoint +^^^^^^^^^^^^^^^^ + +Stop a running endpoint: + +.. code-block:: bash + + $ chiltepin endpoint stop my-endpoint + +With custom configuration directory: + +.. code-block:: bash + + $ chiltepin endpoint -c /path/to/config stop my-endpoint + +This will: + +1. Stop accepting new tasks +2. Wait for running tasks to complete (graceful shutdown) +3. Terminate the endpoint daemon + +Delete an Endpoint +^^^^^^^^^^^^^^^^^^ + +Remove an endpoint configuration: + +.. code-block:: bash + + $ chiltepin endpoint delete my-endpoint + +With custom configuration directory: + +.. code-block:: bash + + $ chiltepin endpoint -c /path/to/config delete my-endpoint + +This will: + +1. Delete the endpoint configuration directory +2. Remove all associated files +3. Deregister the endpoint UUID from Globus Compute services + +.. warning:: + This operation is permanent and cannot be undone. + +.. note:: + Deleting an endpoint does not stop it if it's currently running. You must stop the endpoint + first before deleting it. + +Endpoint Lifecycle +------------------ + +Typical Workflow +^^^^^^^^^^^^^^^^ + +1. **Configure** - Set up a new endpoint + + .. code-block:: bash + + chiltepin endpoint configure my-hpc-endpoint + +2. **Start** - Launch the endpoint + + .. code-block:: bash + + chiltepin endpoint start my-hpc-endpoint + +3. **Get UUID** - Retrieve the endpoint UUID + + .. code-block:: bash + + chiltepin endpoint list + # Note the UUID for my-hpc-endpoint + +4. 
**Use in Config** - Add to your Chiltepin YAML configuration + + .. code-block:: yaml + + my-executor: + endpoint: "12345678-1234-1234-1234-123456789abc" + mpi: True + provider: "slurm" + # ... additional options + +5. **Submit Tasks** - Run your workflow + +6. **Stop** - When finished (or let it auto-shutdown) + + .. code-block:: bash + + chiltepin endpoint stop my-hpc-endpoint + +Endpoint Auto-Scaling +^^^^^^^^^^^^^^^^^^^^^ + +Globus Compute endpoints automatically scale based on task demand: + +- **Idle Shutdown**: Endpoints automatically stop after extended idle periods +- **On-Demand Start**: Endpoints can be automatically restarted when new tasks arrive +- **Resource Scaling**: Blocks (resource pools) scale between ``min_blocks`` and ``max_blocks`` + +The endpoint configuration includes: + +- ``idle_heartbeats_soft: 120`` - Shutdown after ~60 minutes of inactivity (at 30s/heartbeat) +- ``idle_heartbeats_hard: 5760`` - Force shutdown after ~48 hours even if tasks are stuck + +User Endpoint Configuration +---------------------------- + +When you use an endpoint UUID in your Chiltepin configuration, the configuration options define +the properties of a resource pool to be allocated and used by that endpoint. Multiple resource pools +can be configured for the same endpoint UUID using different options for different purposes. This +allows you to have a single endpoint for a particular HPC system. That single endpoint can run different +types of tasks with different resource requirements. + +Resource pool properties include: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Your configuration options define the properties of a resource pool to be allocated and used by that +endpoint. + +**Chiltepin Config**: + +.. code-block:: yaml + + my-executor-1: + endpoint: "uuid-here" + mpi: True + max_mpi_apps: 4 + provider: "slurm" + partition: "gpu" + cores_per_node: 128 + nodes_per_block: 8 + +.. 
code-block:: yaml + + my-executor-2: + endpoint: "uuid-here" # Same endpoint UUID as my-executor-1 + provider: "slurm" + partition: "service" + cores_per_node: 1 + nodes_per_block: 1 + +Troubleshooting +--------------- + +Check Endpoint Status +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + $ chiltepin endpoint list + +If an endpoint shows as "Unknown", it may not be running or may have crashed. + +View Endpoint Logs +^^^^^^^^^^^^^^^^^^ + +Logs are stored in the endpoint directory: + +.. code-block:: bash + + $ ls -la ~/.globus_compute/my-endpoint/ + + # View the main endpoint log + $ tail -f ~/.globus_compute/my-endpoint/endpoint.log + +Authentication Issues +^^^^^^^^^^^^^^^^^^^^^ + +If you encounter authentication errors: + +.. code-block:: bash + + $ chiltepin logout + $ chiltepin login + +Tasks Not Running +^^^^^^^^^^^^^^^^^ + +Verify: + +1. Endpoint is running: ``chiltepin endpoint list`` +2. Correct endpoint UUID in your configuration +3. Resource limits (walltime, nodes), endpoint resource pool job may be pending in the scheduler +4. Check endpoint logs for error messages + +Python API +---------- + +You can also manage endpoints programmatically: + +.. code-block:: python + + import chiltepin.endpoint as endpoint + + # Login + clients = endpoint.login() + compute_client = clients["compute"] + + # Configure endpoint + endpoint.configure("my-endpoint") + + # Start endpoint + endpoint.start("my-endpoint") + + # List endpoints + ep_info = endpoint.show() + for name, props in ep_info.items(): + print(f"{name}: {props['id']}") + + # Stop endpoint + endpoint.stop("my-endpoint") + + # Logout + endpoint.logout() + +See Also +-------- + +- :doc:`configuration` - Configuration file format +- :doc:`quickstart` - Complete workflow example +- :doc:`api` - Full Python API reference diff --git a/docs/index.rst b/docs/index.rst index b10dfc3a..5e1de376 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,39 +1,41 @@ Chiltepin Documentation ======================= -.. 
image:: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docker-slurm.yaml/badge.svg - :target: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/docker-slurm.yaml +.. image:: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/test-suite.yaml/badge.svg + :target: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/actions/workflows/test-suite.yaml :alt: ExascaleSandboxTests **Chiltepin** is a Python library for exploring federated workflow capabilities using Parsl and Globus Compute. It provides tools and demonstrations for -implementing exascale scientific workflows on HPC systems. +implementing distributed scientific workflows on HPC systems. .. warning:: This project is for research and exploration purposes only. It is not - intended for production use. + intended for use in operational production environments. Overview -------- -This repository is a collection of tools and demonstrations used to explore and -test various technologies for implementing exascale scientific workflows. The +This repository is a collection of tools and demonstrations used for +implementing distributed exascale scientific workflows. 
The project focuses on: -* **Federated workflow management** using Parsl -* **Distributed computing** with Globus Compute -* **HPC integration** for multiple NOAA systems (Hera, Hercules, Ursa) +* **Workflow management** using Parsl +* **Federated distributed computing** with Globus Compute +* **HPC integration** of multiple on-prem and/or cloud-based systems * **Container-based testing** with Docker and Slurm Key Features ------------ -* Configuration-based resource management for multiple HPC platforms -* Support for both MPI and non-MPI workflows +* Configuration-based resource management for both HPC platforms and laptops +* Support for both MPI (HPC) and non-MPI (HTC) applications * Globus Compute endpoint management utilities +* Task decorators for seamless integration of Parsl and Globus Compute +* Dynamic distributed task execution across heterogeneous resources * Docker container environment for development and testing -* Comprehensive test suite with 100% coverage for core modules +* Comprehensive test suite with high coverage for core modules Getting Started --------------- @@ -43,6 +45,10 @@ Getting Started installation quickstart + tasks + data + configuration + endpoints testing container diff --git a/docs/installation.rst b/docs/installation.rst index 06b99cc1..4a54a233 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,8 +1,8 @@ Installation ============ -This software can be installed on Linux systems. macOS is not currently supported, -but can be used on Macs via the Docker container (see :doc:`container`). +This software can be installed on Linux systems. Native Windows and macOS are not currently supported, +but Chiltepin can be used on these platforms via the Docker container (see :doc:`container`). 
Prerequisites ------------- @@ -19,7 +19,7 @@ The recommended method for installation is to use a Python virtual environment: $ python -m venv .chiltepin $ source .chiltepin/bin/activate - $ pip install -e .[test] + $ pip install -e ".[test]" .. note:: @@ -34,7 +34,7 @@ Alternatively, you can use a conda environment (anaconda3, miniconda3, miniforge $ conda create -n "chiltepin" python=3.10 $ conda activate chiltepin - $ pip install -e .[test] + $ pip install -e ".[test]" Activating the Environment --------------------------- @@ -59,8 +59,8 @@ Dependencies Chiltepin has the following core dependencies: -* ``globus-compute-sdk`` (4.5.0) -* ``globus-compute-endpoint`` (4.5.0) +* ``globus-compute-sdk`` (>=4.5.0,<4.7.0) +* ``globus-compute-endpoint`` (>=4.5.0,<4.7.0) * ``parsl`` (>=2026.1.5) These will be automatically installed when you install Chiltepin. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 80417a17..a9e6c214 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -1,82 +1,431 @@ Quick Start =========== -This guide will help you get started with Chiltepin quickly. +This guide walks you through a complete Chiltepin workflow, from setting up an +endpoint to submitting tasks. Overview -------- -Chiltepin is a collection of tools for exploring and testing technologies for -implementing exascale scientific workflows using Parsl and Globus Compute. +Chiltepin is a collection of tools for implementing distributed exascale numerical +weather prediction workflows using Parsl and Globus Compute. .. warning:: - This collection of resources is not intended for production use, and is for - research purposes only. + This collection of resources is not intended for use in operational production + environments, and is for research purposes only. -Basic Usage ------------ +Prerequisites +------------- + +Before starting, ensure you have: + +1. Installed Chiltepin (see :doc:`installation`) +2. 
Access to an HPC system (or use local execution for testing) +3. A `Globus account `_ and a web browser for Globus authentication + +Complete Workflow Example +-------------------------- + +This example demonstrates the full workflow: configure an endpoint, start it, and submit tasks. + +Step 1: Authenticate +^^^^^^^^^^^^^^^^^^^^ + +First, log in to Globus services. This should be done on the machine where you want to run +tasks: + +.. code-block:: bash + + $ chiltepin login + +This opens a browser for authentication or, if one is not available, provides a URL to complete +the authentication manually. Follow the prompts to authorize Chiltepin. + +Step 2: Configure an Endpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Create a new Globus Compute endpoint to which you will submit tasks. This should be done on the +machine where you want to run tasks: + +.. code-block:: bash + + $ chiltepin endpoint configure my-endpoint + +This creates the endpoint configuration in ``~/.globus_compute/my-endpoint/``. + +Step 3: Start the Endpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Launch the endpoint: + +.. code-block:: bash + + $ chiltepin endpoint start my-endpoint + +The endpoint will register with Globus Compute and begin accepting tasks. + +Step 4: Get the Endpoint UUID +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Retrieve your endpoint's UUID: + +.. code-block:: bash + + $ chiltepin endpoint list + +Example output: + +.. code-block:: text + + my-endpoint a1b2c3d4-1234-5678-90ab-cdef12345678 Running + +Note the UUID (``a1b2c3d4-1234-5678-90ab-cdef12345678``) for the next step. + +Step 5: Create a Configuration File +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Create ``my_config.yaml`` with your endpoint UUID: + +.. 
code-block:: yaml -After installing Chiltepin (see :doc:`installation`), you can start using it by -importing the modules you need: + # Local resource for small tasks + local: + provider: "localhost" + init_blocks: 1 + max_blocks: 1 + + # Remote endpoint for HPC tasks + remote: + endpoint: "a1b2c3d4-1234-5678-90ab-cdef12345678" # Use your UUID + provider: "slurm" + cores_per_node: 4 + nodes_per_block: 1 + partition: "compute" + account: "myproject" + walltime: "00:30:00" + environment: + - "module load python/3.11" + +Replace the endpoint UUID with your actual UUID from Step 4. + +Step 6: Write Your Workflow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Create ``my_workflow.py``: .. code-block:: python + import parsl import chiltepin.configure - import chiltepin.tasks - import chiltepin.endpoint + from chiltepin.tasks import bash_task, python_task + + # Define tasks + @python_task + def hello_local(): + import platform + return f"Hello from {platform.node()}" + + @bash_task + def hello_remote(): + return "hostname" + + @python_task + def compute_task(n): + """Simple computation task""" + result = sum(i**2 for i in range(n)) + return result + + if __name__ == "__main__": + # Load configuration + config_dict = chiltepin.configure.parse_file("my_config.yaml") + parsl_config = chiltepin.configure.load( + config_dict, + include=["local", "remote"], + run_dir="./runinfo" + ) + + with parsl.load(parsl_config): + # Run local task on "local" resource + local_future = hello_local(executor="local") + + # Run remote bash task on "remote" resource (returns exit code: 0 = success) + remote_future = hello_remote(executor="remote") + + # Run multiple compute tasks on "remote" resource + futures = [compute_task(i, executor="remote") for i in range(1, 5)] + + # Get the results + print(f"Local: {local_future.result()}") + print(f"Remote exit code: {remote_future.result()}") + print(f"Computation results: {[f.result() for f in futures]}") + + print("All tasks completed!") -Configuration 
-------------- +Step 7: Run Your Workflow +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Execute the workflow: -Chiltepin uses YAML configuration files to define compute resources. Example -configurations for various platforms can be found in the ``tests/configs/`` directory: +.. code-block:: bash -* ``docker.yaml`` - For Docker container environments -* ``hera.yaml`` - For NOAA Hera HPC system -* ``hercules.yaml`` - For NOAA Hercules HPC system -* ``ursa.yaml`` - For NOAA Ursa HPC system + $ python my_workflow.py -You can load a configuration using: +Expected output: + +.. code-block:: text + + Local: Hello from my-laptop.local + Remote exit code: 0 + Computation results: [0, 1, 5, 14] + All tasks completed! + +Step 8: Stop the Endpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When finished: + +.. code-block:: bash + + $ chiltepin endpoint stop my-endpoint + +.. note:: + Endpoints automatically scale down resources after idle periods, so manual stopping is + optional. + +Local-Only Quickstart +--------------------- + +For testing without an HPC system, use local execution: + +Configuration File (``local_config.yaml``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + local: + provider: "localhost" + init_blocks: 1 + max_blocks: 1 + +Simple Workflow (``simple_workflow.py``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code-block:: python + import parsl import chiltepin.configure + from chiltepin.tasks import bash_task, python_task + + # Define tasks + @python_task + def multiply(a, b): + return a * b + + @bash_task + def system_info(): + return "echo 'Task completed successfully'" + + if __name__ == "__main__": + # Load configuration + config_dict = chiltepin.configure.parse_file("local_config.yaml") + parsl_config = chiltepin.configure.load(config_dict, run_dir="./runinfo") + + with parsl.load(parsl_config): + result = multiply(6, 7, executor="local").result() + print(f"6 * 7 = {result}") + + exit_code = system_info(executor="local").result() + print(f"Bash task exit code: {exit_code}") + +Run it: + +.. code-block:: bash + + $ python simple_workflow.py + +Working with MPI Tasks +---------------------- + +Chiltepin supports MPI applications on HPC systems: + +Configuration (``mpi_config.yaml``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + mpi-resource-name: + endpoint: "your-endpoint-uuid" + mpi: True + max_mpi_apps: 2 + mpi_launcher: "srun" + provider: "slurm" + cores_per_node: 128 + nodes_per_block: 4 + partition: "compute" + account: "myproject" + walltime: "01:00:00" + environment: + - "module load openmpi/4.1" + - "export MPIF90=$MPIF90" + +MPI Workflow +^^^^^^^^^^^^ + +.. 
code-block:: python + import parsl + import chiltepin.configure + from chiltepin.tasks import bash_task + + @bash_task + def compile_mpi(): + return "$MPIF90 -o mpi_app mpi_app.f90" + + @bash_task + def run_mpi(ranks=4): + return f"srun -n {ranks} ./mpi_app" - # Parse the Chiltepin configuration file - config = chiltepin.configure.parse_file("tests/configs/docker.yaml") + if __name__ == "__main__": + config_dict = chiltepin.configure.parse_file("mpi_config.yaml") + parsl_config = chiltepin.configure.load(config_dict, run_dir="./runinfo") + + with parsl.load(parsl_config): + # Compile MPI application on the MPI resource (returns exit code) + compile_result = compile_mpi(executor="mpi-resource-name").result() + print(f"Compilation exit code: {compile_result}") + + # Run with different rank counts on the MPI resource + results = [] + for ranks in [4, 8, 16]: + future = run_mpi(ranks, executor="mpi-resource-name") + results.append(future.result()) + + for i, result in enumerate(results, 1): + print(f"Run {i} exit code: {result}") + +Key Concepts +------------ + +Resources +^^^^^^^^^ + +Resources define where and how tasks run: + +- **Local**: Runs on the current machine +- **HPC**: Submits jobs to schedulers (Slurm, PBS Pro) +- **Globus Compute**: Runs on remote endpoints + +See :doc:`configuration` for detailed resource configuration options. + +Task Decorators +^^^^^^^^^^^^^^^ + +Chiltepin provides three task decorators to define workflow tasks: + +- ``@python_task``: Execute Python functions +- ``@bash_task``: Execute shell commands (returns exit code) +- ``@join_task``: Coordinate multiple tasks without blocking + +When calling a task, use the ``executor`` parameter to specify which resource to use: + +.. 
code-block:: python + + @python_task + def my_task(): + return "result" - # Create a Parsl Config from the Chiltepin configuration + # Specify which resource to use + result = my_task(executor="compute").result() + +The ``executor`` value must match a resource name from your configuration file. + +.. seealso:: + For comprehensive documentation on defining and using tasks, including advanced + patterns, error handling, and best practices, see :doc:`tasks`. + +Configuration Loading +^^^^^^^^^^^^^^^^^^^^^ + +The ``include`` parameter selects specific resources to load from the configuration: + +.. code-block:: python + + # Load only specific resources parsl_config = chiltepin.configure.load( - config, - include=["service"], + config_dict, + include=["local", "compute"], # Only these resources run_dir="./runinfo" ) - # Load the Parsl configuration to initialize executors - parsl.load(parsl_config) +If ``include`` is omitted, all resources in the configuration are loaded. + +Directory Structure +------------------- + +After running workflows, you'll see: + +.. code-block:: text + + . + β”œβ”€β”€ my_config.yaml # Configuration file + β”œβ”€β”€ my_workflow.py # Workflow script + └── runinfo/ # Parsl runtime directory + β”œβ”€β”€ 000/ # Run directory + β”‚ β”œβ”€β”€ local/ # Local resource files + β”‚ β”œβ”€β”€ remote/ # Remote resource files + β”‚ └── submit_scripts/ # Job submission scripts + └── parsl.log # Parsl log file -Working with Tasks ------------------- +The ``runinfo`` directory contains execution logs, job scripts, and task outputs. -Chiltepin provides decorators for creating tasks that can run on remote executors: +Troubleshooting +--------------- + +Tasks Not Running +^^^^^^^^^^^^^^^^^ + +1. Verify endpoint is running: ``chiltepin endpoint list`` +2. Check you're using the correct endpoint UUID +3. Review logs in ``runinfo/`` directory +4. 
Check endpoint logs: ``~/.globus_compute/my-endpoint/endpoint.log`` + +Authentication Expired +^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + $ chiltepin logout + $ chiltepin login + +Configuration Errors +^^^^^^^^^^^^^^^^^^^^ + +Validate your YAML syntax: .. code-block:: python - from chiltepin.tasks import bash_task, python_task - - @python_task - def hello(): - return "Hello from Chiltepin!" - - @bash_task - def run_command(): - return "echo 'Running on remote executor'" + import yaml + with open("my_config.yaml") as f: + config = yaml.safe_load(f) + print(config) + +Resource Limits +^^^^^^^^^^^^^^^ + +If jobs fail to start: + +- Check partition/queue names +- Verify account/project is valid +- Confirm node/core requests are within limits +- Machine may be busy and resource pool job may be pending or may be full Next Steps ---------- -* Learn how to run the test suite: :doc:`testing` -* Set up the Docker container environment: :doc:`container` -* Explore the full API documentation: :doc:`api` +* Comprehensive task documentation: :doc:`tasks` +* Detailed configuration options: :doc:`configuration` +* Endpoint management: :doc:`endpoints` +* Run the test suite: :doc:`testing` +* Set up Docker environment: :doc:`container` +* Explore the API: :doc:`api` diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 53fc1f32..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -sphinx==7.1.2 -sphinx-rtd-theme==1.3.0rc1 diff --git a/docs/tasks.rst b/docs/tasks.rst new file mode 100644 index 00000000..6b5b6c7c --- /dev/null +++ b/docs/tasks.rst @@ -0,0 +1,870 @@ +Tasks +===== + +Chiltepin provides decorators to define workflow tasks that can be executed on +configured resources. Tasks are the fundamental units of work in a Chiltepin workflow. + +.. note:: + Chiltepin's task decorators are thin wrappers around Parsl's ``@python_app``, + ``@bash_app``, and ``@join_app`` decorators. Chiltepin adds two key capabilities: + + 1. 
**Method decoration**: Support for decorating class methods that reference ``self`` + 2. **Dynamic resource selection**: Ability to choose the execution resource at runtime + via the ``executor`` parameter + + For more information about Parsl's execution model and features, see the + `Parsl documentation `_. + +Overview +-------- + +Chiltepin offers three task decorators: + +- **@python_task**: Execute Python functions as workflow tasks +- **@bash_task**: Execute shell commands as workflow tasks +- **@join_task**: Coordinate multiple tasks without blocking workflow execution + +When you decorate a function with one of these decorators, it becomes a workflow task +that can be submitted for execution on any configured resource. The function itself +defines *what* to execute, while the ``executor`` parameter at call time specifies +*where* to execute it. + +Python Tasks +------------ + +The ``@python_task`` decorator transforms a Python function into a workflow task. +The function will be serialized and executed on the specified resource. + +Basic Usage +^^^^^^^^^^^ + +.. code-block:: python + + from chiltepin.tasks import python_task + + @python_task + def hello_world(): + return "Hello from a Chiltepin task!" + + # Call the task and specify where to run it + future = hello_world(executor="my-resource") + result = future.result() # Wait for completion and get result + print(result) # "Hello from a Chiltepin task!" + +Tasks with Arguments +^^^^^^^^^^^^^^^^^^^^ + +Python tasks can accept both positional and keyword arguments: + +.. 
code-block:: python + + @python_task + def add_numbers(a, b, multiply_by=1): + return (a + b) * multiply_by + + # Use positional arguments + future1 = add_numbers(5, 3, executor="compute") + print(future1.result()) # 8 + + # Use keyword arguments + future2 = add_numbers(5, 3, multiply_by=2, executor="compute") + print(future2.result()) # 16 + +Importing Modules +^^^^^^^^^^^^^^^^^ + +Since tasks may execute on remote systems, import statements should be inside the +function: + +.. code-block:: python + + @python_task + def get_hostname(): + import platform + return platform.node() + + @python_task + def process_data(filename): + import pandas as pd + import numpy as np + + df = pd.read_csv(filename) + return np.mean(df['values']) + +Return Values +^^^^^^^^^^^^^ + +Python tasks can return any serializable Python object: + +.. code-block:: python + + @python_task + def get_list(n): + return list(range(n)) + + @python_task + def get_dict(key, value): + return {key: value} + + @python_task + def get_dataframe(): + import pandas as pd + return pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + + list_result = get_list(5, executor="local").result() + dict_result = get_dict("temperature", 72.5, executor="local").result() + df_result = get_dataframe(executor="compute").result() + +Bash Tasks +---------- + +The ``@bash_task`` decorator transforms a function into a shell command workflow task. +The function must return a string containing the bash commands to execute. + +Basic Usage +^^^^^^^^^^^ + +.. code-block:: python + + from chiltepin.tasks import bash_task + + @bash_task + def echo_hello(): + return "echo 'Hello from bash!'" + + # Bash tasks return the exit code (0 = success) + future = echo_hello(executor="my-resource") + exit_code = future.result() + print(exit_code) # 0 + +Dynamic Command Generation +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use function arguments to dynamically construct commands: + +.. 
code-block:: python + + @bash_task + def process_file(input_file, output_file): + return f"cat {input_file} | sort | uniq > {output_file}" + + @bash_task + def run_simulation(config_file, num_steps): + return f"./my_simulator --config {config_file} --steps {num_steps}" + + exit_code = process_file("data.txt", "sorted.txt", executor="compute").result() + +.. warning:: + Be careful with shell injection vulnerabilities. Validate and sanitize inputs + when constructing shell commands from user-provided data. + +Capturing Output +^^^^^^^^^^^^^^^^ + +By default, bash tasks return the exit code. To capture stdout or stderr, use the +``stdout`` and ``stderr`` parameters that are automatically added to bash tasks: + +.. code-block:: python + + @bash_task + def get_hostname(): + return "hostname" + + # Capture stdout to a file + future = get_hostname( + executor="compute", + stdout="hostname_output.txt" + ) + exit_code = future.result() + + # Read the captured output + with open("hostname_output.txt") as f: + hostname = f.read().strip() + print(f"Task ran on: {hostname}") + +.. note:: + When running tasks on remote resources (via Globus Compute endpoints), output files + are created on the remote system, not on the local host. You'll need to use shared + filesystems, data staging, or file transfer mechanisms to access these files locally. + +You can also capture stderr for debugging: + +.. code-block:: python + + @bash_task + def risky_command(): + return "some_command_that_might_fail" + + future = risky_command( + executor="compute", + stdout="output.txt", + stderr="errors.txt" + ) + +Multi-line Commands +^^^^^^^^^^^^^^^^^^^ + +For complex bash scripts, return multi-line strings: + +.. 
code-block:: python + + @bash_task + def setup_and_run(workdir): + return f""" + mkdir -p {workdir} + cd {workdir} + git clone https://github.com/example/repo.git + cd repo + make + ./run_tests.sh + """ + +Join Tasks +---------- + +The ``@join_task`` decorator creates tasks that coordinate other tasks without blocking +the main workflow. Join tasks can launch multiple subtasks and depend on their results. + +Basic Usage +^^^^^^^^^^^ + +Join tasks call other tasks and return futures: + +.. code-block:: python + + from chiltepin.tasks import python_task, join_task + + @python_task + def multiply(x, factor): + return x * factor + + @python_task + def add_values(*values): + return sum(values) + + @join_task + def process_list(values, factor): + # Launch multiple tasks in parallel + futures = [multiply(v, factor, executor="compute") for v in values] + # Aggregate results with another task + return add_values(*futures, executor="compute") + + # Process [1, 2, 3] with factor 2: (1*2) + (2*2) + (3*2) = 12 + result = process_list([1, 2, 3], 2).result() + print(result) # 12 + +When to Use Join Tasks +^^^^^^^^^^^^^^^^^^^^^^ + +Use join tasks when you need to: + +1. **Fan-out operations**: Launch many parallel tasks based on input data +2. **Task dependencies**: Chain tasks where one depends on another's output +3. **Dynamic workflows**: Create tasks based on runtime conditions + +Example - Processing Multiple Files: + +.. 
code-block:: python + + @bash_task + def process_file(filepath): + return f"./process.sh {filepath}" + + @python_task + def check_all_success(exit_codes): + return all(code == 0 for code in exit_codes) + + @join_task + def process_all_files(file_list): + # Process all files in parallel + futures = [process_file(f, executor="compute") for f in file_list] + # Check if all succeeded + return check_all_success(futures, executor="local") + + files = ["data1.txt", "data2.txt", "data3.txt"] + success = process_all_files(files).result() + +Mixing Task Types +^^^^^^^^^^^^^^^^^ + +Join tasks can coordinate both python and bash tasks: + +.. code-block:: python + + @bash_task + def compile_code(): + return "gcc -o myapp myapp.c" + + @bash_task + def run_app(input_file): + return f"./myapp {input_file}" + + @python_task + def parse_results(output_file): + with open(output_file) as f: + return float(f.read().strip()) + + @join_task + def compile_and_run(input_file): + # First compile + compile_future = compile_code(executor="compute") + compile_future.result() # Wait for compilation + + # Then run + run_future = run_app(input_file, executor="compute", stdout="output.txt") + run_future.result() # Wait for execution + + # Parse results + return parse_results("output.txt", executor="local") + + result = compile_and_run("input.dat").result() + +Tasks as Class Methods +---------------------- + +All task decorators work with both standalone functions and class methods. This enables +object-oriented workflow design: + +.. 
code-block:: python + + from chiltepin.tasks import python_task, bash_task + + class DataProcessor: + def __init__(self, config): + self.config = config + + @python_task + def load_data(self, filename): + import pandas as pd + # Can access self and instance variables + return pd.read_csv(filename, **self.config) + + @python_task + def transform_data(self, df): + # Use instance configuration + if self.config.get('normalize'): + return (df - df.mean()) / df.std() + return df + + @bash_task + def export_data(self, output_file): + # self is available in method tasks + format_type = self.config.get('export_format', 'csv') + return f"convert_data --format {format_type} -o {output_file}" + + # Create instance and use tasks + processor = DataProcessor({'normalize': True, 'export_format': 'json'}) + data = processor.load_data("input.csv", executor="compute").result() + transformed = processor.transform_data(data, executor="compute").result() + exit_code = processor.export_data("output.json", executor="compute").result() + +.. warning:: + **Mutable Object State**: When using class methods as tasks, be aware that mutable + object state can lead to non-deterministic behavior in distributed systems. Each task + captures the object state at the time it's submitted. If the object's state is modified + between task submissions (e.g., updating ``self.config``), different tasks may see + different states, leading to unexpected results. For best reliability, use immutable + configuration or pass state explicitly as task arguments rather than relying on + mutable instance variables. + +Specifying Resources +-------------------- + +The ``executor`` parameter determines where a task runs. This parameter refers to +resource names defined in your configuration file. + +.. note:: + The parameter is called ``executor`` due to Parsl's API, but it specifies which + resource to use, not an "executor" in the traditional programming sense. 
+ +Single Resource +^^^^^^^^^^^^^^^ + +Specify a single resource by name: + +.. code-block:: python + + @python_task + def my_task(): + return "result" + + # Run on the "compute" resource from your config + future = my_task(executor="compute") + +Multiple Resources +^^^^^^^^^^^^^^^^^^ + +Provide a list of resource names to allow Parsl to choose based on availability: + +.. code-block:: python + + # Can run on either resource + future = my_task(executor=["compute", "backup-compute"]) + +Default Executor +^^^^^^^^^^^^^^^^ + +If you omit the ``executor`` parameter, the task can run on any configured resource: + +.. code-block:: python + + # Can run on any available resource + future = my_task() + +.. tip:: + For production workflows, explicitly specify resources to ensure tasks run where + intended (e.g., GPU tasks on GPU resources, MPI tasks on MPI resources). + +Futures and Results +------------------- + +Task calls return ``AppFuture`` objects, which represent asynchronous computation. + +Getting Results +^^^^^^^^^^^^^^^ + +Call ``.result()`` to wait for task completion and retrieve the result: + +.. code-block:: python + + @python_task + def compute_value(): + import time + time.sleep(2) + return 42 + + future = compute_value(executor="compute") + print("Task submitted, doing other work...") + + # This blocks until the task completes + result = future.result() + print(f"Result: {result}") + +Checking Status +^^^^^^^^^^^^^^^ + +Check if a task is done without blocking: + +.. code-block:: python + + future = compute_value(executor="compute") + + if future.done(): + print("Task completed!") + print(future.result()) + else: + print("Task still running...") + +Multiple Futures +^^^^^^^^^^^^^^^^ + +Wait for multiple tasks efficiently: + +.. 
code-block:: python + + # Launch multiple tasks + futures = [compute_value(executor="compute") for _ in range(10)] + + # Wait for all to complete + results = [f.result() for f in futures] + print(f"All done: {results}") + +Exception Handling +^^^^^^^^^^^^^^^^^^ + +Exceptions raised in tasks are re-raised when calling ``.result()``: + +.. code-block:: python + + @python_task + def failing_task(): + raise ValueError("Something went wrong!") + + future = failing_task(executor="compute") + + try: + result = future.result() + except ValueError as e: + print(f"Task failed: {e}") + +File and Data Handling +---------------------- + +Working with Files +^^^^^^^^^^^^^^^^^^ + +Tasks can read and write files, but file paths must be accessible from the resource +where the task runs: + +.. code-block:: python + + @python_task + def process_file(input_path, output_path): + with open(input_path) as f: + data = f.read() + + processed = data.upper() + + with open(output_path, 'w') as f: + f.write(processed) + + return output_path + +.. warning:: + When running on remote resources via Globus Compute, ensure files are accessible + on the remote system. You may need to stage files or use shared filesystems. + +Data Transfer Between Endpoints +"""""""""""""""""""""""""""""""" + +For moving data between Globus Transfer endpoints, Chiltepin provides specialized data +transfer and deletion tasks that can be incorporated into your workflows: + +.. 
code-block:: python + + from chiltepin.data import transfer_task, delete_task + from chiltepin.tasks import python_task + + @python_task + def process_file(transfer_complete, input_path): + # transfer_complete is a boolean we can check or ignore + # The important part is passing it creates a dependency + with open(input_path) as f: + data = f.read() + return len(data) + + # Transfer data between Globus Transfer endpoints + transfer = transfer_task( + src_ep="my-source-endpoint", + dst_ep="my-dest-endpoint", + src_path="/data/input.dat", + dst_path="/scratch/input.dat", + executor="local" + ) + + # Process the transferred data (waits for transfer by passing its future) + result = process_file(transfer, "/scratch/input.dat", executor="compute") + + # Clean up after processing + cleanup = delete_task( + src_ep="my-dest-endpoint", + src_path="/scratch/input.dat", + executor="local", + inputs=[result] # Waits for processing to complete + ) + cleanup.result() + +These tasks operate on Globus **Transfer endpoints** (which are different from +Globus Compute endpoints used for execution). See :doc:`data` for comprehensive +documentation on data transfer and deletion tasks. + +Passing Data Between Tasks +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pass data directly through futures: + +.. code-block:: python + + @python_task + def generate_data(n): + return list(range(n)) + + @python_task + def sum_data(data): + return sum(data) + + # Data flows through futures + data_future = generate_data(100, executor="compute") + sum_future = sum_data(data_future, executor="compute") + result = sum_future.result() + +For large data, consider files or data staging strategies. + +Advanced Topics +--------------- + +Environment Variables +^^^^^^^^^^^^^^^^^^^^^ + +Access environment variables in tasks: + +.. 
code-block:: python + + @python_task + def get_user(): + import os + return os.environ.get('USER', 'unknown') + + @bash_task + def get_user_bash(): + return "echo ${USER:-unknown}" + +Task Dependencies +^^^^^^^^^^^^^^^^^ + +Create task dependencies to ensure tasks execute in the correct order. There are +two primary methods for establishing dependencies between tasks. + +Passing Futures as Arguments +""""""""""""""""""""""""""""" + +The most common approach is to pass a future from one task as an argument to another: + +.. code-block:: python + + @python_task + def step1(): + return "result1" + + @python_task + def step2(input_data): + return f"processed_{input_data}" + + @python_task + def step3(input_data): + return f"final_{input_data}" + + # Chain tasks - data flows through futures + future1 = step1(executor="compute") + future2 = step2(future1, executor="compute") # Waits for future1 + future3 = step3(future2, executor="compute") # Waits for future2 + + final_result = future3.result() + +Using the inputs Parameter +""""""""""""""""""""""""""" + +For dependencies where you don't need to pass data between tasks, use the ``inputs`` +parameter. This is automatically supported by all Chiltepin task decorators +(via Parsl's underlying implementation): + +.. 
code-block:: python + + from chiltepin.data import delete_task, transfer_task + from chiltepin.tasks import python_task + + # Stage data to compute resource + stage = transfer_task( + src_ep="laptop", + dst_ep="hpc-scratch", + src_path="/data/input.dat", + dst_path="/scratch/input.dat", + executor="local" + ) + + # Process the data - waits for transfer without passing its result + @python_task + def process_data(filepath): + with open(filepath) as f: + return len(f.read()) + + result = process_data("/scratch/input.dat", executor="compute", inputs=[stage]) + + # Clean up - waits for processing to complete + cleanup = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/input.dat", + executor="local", + inputs=[result] + ) + +The ``inputs`` parameter accepts a list of futures that must complete before the task +starts. This is particularly useful for: + +- Ensuring files are transferred before processing begins +- Coordinating cleanup operations after processing completes +- Creating dependencies when you don't need to pass data between tasks +- Coordinating multiple independent prerequisites + +Multiple Dependencies +""""""""""""""""""""" + +You can combine both approaches and specify multiple dependencies: + +.. code-block:: python + + @python_task + def task_a(): + return "data_a" + + @python_task + def task_b(): + return "data_b" + + @python_task + def task_c(): + # Just needs to wait, doesn't use the result + pass + + @python_task + def combine(data1, data2): + return f"{data1}_{data2}" + + a = task_a(executor="compute") + b = task_b(executor="compute") + c = task_c(executor="compute") + + # Combine waits for a and b (via arguments) and c (via inputs) + result = combine(a, b, executor="compute", inputs=[c]) + +.. tip:: + **Avoid premature .result() calls**: In this example, notice that ``.result()`` is only + called once at the very end. 
By passing futures directly as arguments instead of calling + ``.result()`` immediately, you allow Parsl to manage task dependencies automatically and + schedule tasks as soon as their dependencies complete. This maximizes parallelism. + + **Bad practice** (blocks unnecessarily): + + .. code-block:: python + + result1 = step1(executor="compute").result() # Blocks here + result2 = step2(result1, executor="compute").result() # Blocks here + result3 = step3(result2, executor="compute").result() # Blocks here + + **Good practice** (maximizes parallelism): + + .. code-block:: python + + future1 = step1(executor="compute") + future2 = step2(future1, executor="compute") # Scheduled, doesn't block + future3 = step3(future2, executor="compute") # Scheduled, doesn't block + result = future3.result() # Only block when you need the final result + +Timeout Handling +^^^^^^^^^^^^^^^^ + +Handle long-running tasks with timeouts: + +.. code-block:: python + + from concurrent.futures import TimeoutError + + @python_task + def long_task(): + import time + time.sleep(100) + return "done" + + future = long_task(executor="compute") + + try: + result = future.result(timeout=10) # Wait max 10 seconds + except TimeoutError: + print("Task timed out") + +Retry Logic +^^^^^^^^^^^ + +Implement retry logic for unreliable tasks: + +.. code-block:: python + + def run_with_retry(task_func, *args, max_retries=3, **kwargs): + for attempt in range(max_retries): + try: + future = task_func(*args, **kwargs) + return future.result() + except Exception as e: + if attempt == max_retries - 1: + raise + print(f"Attempt {attempt + 1} failed: {e}, retrying...") + +Best Practices +-------------- + +1. **Keep tasks pure**: Avoid side effects when possible. Tasks should transform inputs + to outputs predictably. + +2. **Import inside tasks**: Always import modules inside task functions, not at the + module level, to ensure imports work on remote systems. + +3. 
**Specify resources explicitly**: Use the ``executor`` parameter to control where + tasks run, especially for resource-specific requirements (GPU, MPI, etc.). + +4. **Handle errors gracefully**: Wrap ``.result()`` calls in try-except blocks for + production workflows. + +5. **Use join tasks for coordination**: Don't block the main thread waiting for + results. Let join tasks coordinate dependencies. + +6. **Validate bash commands**: Sanitize inputs when constructing bash commands to + avoid shell injection vulnerabilities. + +7. **Use descriptive task names**: Function names should clearly indicate what the + task does for easier debugging. + +Common Patterns +--------------- + +Map-Reduce +^^^^^^^^^^ + +.. code-block:: python + + @python_task + def map_task(item): + return item ** 2 + + @python_task + def reduce_task(results): + return sum(results) + + @join_task + def map_reduce(items): + # Map phase + futures = [map_task(item, executor="compute") for item in items] + # Reduce phase + return reduce_task(futures, executor="compute") + + result = map_reduce([1, 2, 3, 4, 5]).result() # 55 + +Pipeline Processing +^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + @python_task + def stage1(data): + return data * 2 + + @python_task + def stage2(data): + return data + 10 + + @python_task + def stage3(data): + return data ** 2 + + # Create pipeline + data1 = stage1(5, executor="compute") + data2 = stage2(data1, executor="compute") + result = stage3(data2, executor="compute").result() # ((5*2)+10)^2 = 400 + +Parameter Sweep +^^^^^^^^^^^^^^^ + +.. 
code-block:: python + + @python_task + def run_experiment(param1, param2): + # Run simulation with parameters + result = param1 * param2 + return {"params": (param1, param2), "result": result} + + # Sweep over parameter space + futures = [] + for p1 in [1, 2, 3]: + for p2 in [10, 20, 30]: + future = run_experiment(p1, p2, executor="compute") + futures.append(future) + + # Collect all results + results = [f.result() for f in futures] + +See Also +-------- + +- :doc:`quickstart` - Quick introduction to tasks in a complete workflow +- :doc:`configuration` - Configuring resources where tasks execute +- :doc:`api` - Full API reference including task decorator signatures + diff --git a/docs/testing.rst b/docs/testing.rst index 3d2cd4f0..417731e4 100644 --- a/docs/testing.rst +++ b/docs/testing.rst @@ -2,27 +2,27 @@ Testing ======= The Chiltepin test suite uses pytest and requires an editable installation of the -package (achieved using the ``pip install -e .`` installation step). +package (achieved using the ``pip install -e ".[test]"`` installation step). Prerequisites ------------- Before running tests, ensure you have: -1. Installed Chiltepin with test dependencies: ``pip install -e .[test]`` -2. Authenticated with Globus Compute (for Globus Compute tests) +1. Installed Chiltepin with test dependencies: ``pip install -e ".[test]"`` +2. Authenticated with Globus (for Globus Compute and Globus Transfer tests) Globus Authentication --------------------- -The Globus Compute tests require authentication before running. Use the following +The Globus tests require authentication before running. Use the following command to authenticate: .. code-block:: console - $ globus-compute-endpoint login + $ chiltepin login -This will open a web browser for you to complete the authentication flow. +This will open a web browser, or provide a URL, for you to complete the authentication flow. 
Running Tests ------------- @@ -43,6 +43,9 @@ Where ```` is one of: * ``hercules`` - For NOAA Hercules HPC system * ``ursa`` - For NOAA Ursa HPC system +You can also provide a custom configuration file if you have specific settings or want +to test against a different environment. + Verbose Output ~~~~~~~~~~~~~~ @@ -70,13 +73,15 @@ To run a specific test function: Coverage Reports ---------------- -To run tests with coverage: +To run tests and generate a coverage report: .. code-block:: console $ pytest --cov=src/chiltepin --cov-report=term --config=tests/configs/.yaml -This will display a coverage report showing which lines of code were executed during tests. +This will display a coverage report showing the percentage of lines of code that were +executed during tests. The line numbers of uncovered code will also be displayed for +each file. Test Organization ----------------- @@ -85,13 +90,13 @@ The test suite is organized into several files: * ``test_configure.py`` - Tests for configuration parsing and executor creation * ``test_cli.py`` - Tests for command-line interface functionality -* ``test_tasks.py`` - Tests for task decorators and execution +* ``test_tasks.py`` - Tests for task decorators * ``test_endpoint.py`` - Tests for Globus Compute endpoint management * ``test_data.py`` - Tests for data handling utilities * ``test_parsl_hello.py`` - Basic Parsl integration tests -* ``test_parsl_mpi.py`` - MPI-enabled Parsl tests -* ``test_globus_compute_hello.py`` - Basic Globus Compute tests -* ``test_globus_compute_mpi.py`` - MPI-enabled Globus Compute tests +* ``test_parsl_mpi.py`` - MPI-enabled Parsl integration tests +* ``test_globus_compute_hello.py`` - Basic Globus Compute integration tests +* ``test_globus_compute_mpi.py`` - MPI-enabled Globus Compute integration tests Docker Container Testing ------------------------ diff --git a/pyproject.toml b/pyproject.toml index e0dd95a9..9869d502 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" @@ -19,12 +21,13 @@ maintainers = [ ] description = "Federated NWP Workflow Tools" readme = "README.md" -license = {file = "LICENSE"} +license = {text = "Apache-2.0"} keywords = ["federated", "workflow", "parsl", "globus-compute", "hpc", "scientific-computing"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Science/Research", "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", @@ -40,7 +43,10 @@ classifiers = [ Homepage = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox" Documentation = "https://exascaleworkflowsandbox.readthedocs.io" Repository = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox" +"Source Code" = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox" Issues = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/issues" +"Bug Tracker" = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/issues" +Changelog = "https://github.com/NOAA-GSL/ExascaleWorkflowSandbox/releases" [project.optional-dependencies] test = [ @@ -48,6 +54,10 @@ test = [ "pytest-cov", "ruff", ] +docs = [ + "sphinx==7.1.2", + "sphinx-rtd-theme==1.3.0rc1", +] [project.scripts] chiltepin = "chiltepin.cli:main" diff --git a/recipe/README.md b/recipe/README.md new file mode 100644 index 00000000..adcc864d --- /dev/null +++ b/recipe/README.md @@ -0,0 +1,95 @@ +# Conda Recipe for Chiltepin + +This directory contains the conda-forge recipe for the Chiltepin package. 
+ +## Testing the Recipe Locally + +Before submitting to conda-forge, test the recipe locally: + +```bash +# Install conda-build if needed +conda install conda-build + +# Build the recipe +conda build recipe/ + +# Test install +conda create -n test-chiltepin chiltepin --use-local +conda activate test-chiltepin +chiltepin --help +``` + +## Submitting to conda-forge + +1. **Publish to PyPI first** (conda-forge prefers PyPI sources): + ```bash + python -m build + twine upload dist/* + ``` + +2. **Update the SHA256 hash** in `meta.yaml`: + ```bash + # After publishing to PyPI, download and compute hash + pip download chiltepin==0.1.0 --no-deps --no-binary :all: + sha256sum chiltepin-0.1.0.tar.gz + # Copy the hash and update the sha256 field in meta.yaml + ``` + + **Why PyPI tarball instead of git?** + - PyPI is the official distribution channel for Python packages + - Tarballs are immutable - the hash ensures content never changes + - Faster builds (no git history to clone) + - This is the conda-forge recommended practice + +3. **Fork staged-recipes**: + - Go to https://github.com/conda-forge/staged-recipes + - Click "Fork" to create your fork + +4. **Add your recipe**: + ```bash + git clone https://github.com/YOUR_USERNAME/staged-recipes.git + cd staged-recipes + git checkout -b add-chiltepin + cp -r /path/to/chiltepin/recipe recipes/chiltepin + git add recipes/chiltepin + git commit -m "Add chiltepin recipe" + git push origin add-chiltepin + ``` + +5. **Open a Pull Request**: + - Go to your fork on GitHub + - Click "Compare & pull request" + - Fill out the PR template + - Wait for automated tests and reviews + +6. **After Merge**: + - A `chiltepin-feedstock` repository will be automatically created + - You'll be added as a maintainer + - Future version updates are done via PRs to the feedstock + +## Updating the Recipe for New Versions + +**After your package is on conda-forge**, version updates are done in the **feedstock repository**, not this repo. 
+ +### First-time submission: +- Use `recipe/meta.yaml` from this repo +- Submit to `conda-forge/staged-recipes` + +### For all future releases: +1. Release new version and publish to PyPI +2. Fork `https://github.com/conda-forge/chiltepin-feedstock` +3. Update `recipe/meta.yaml` in the **feedstock**: + - Update `version` variable + - Update `sha256` hash for new tarball + - Reset `build: number:` to 0 +4. Submit PR to the feedstock repository +5. Automated bots will help test and merge + +**Note:** The `meta.yaml` in this repo serves as a reference/template. +After initial submission, the feedstock repository becomes the source of truth. + +## Resources + +- [conda-forge documentation](https://conda-forge.org/docs/) +- [Contributing packages guide](https://conda-forge.org/docs/maintainer/adding_pkgs.html) +- [Example recipes](https://github.com/conda-forge/staged-recipes/tree/main/recipes) diff --git a/recipe/meta.yaml b/recipe/meta.yaml new file mode 100644 index 00000000..1ea0e4c0 --- /dev/null +++ b/recipe/meta.yaml @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: Apache-2.0 + +{% set name = "chiltepin" %} +{% set version = "0.1.0" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/chiltepin-{{ version }}.tar.gz + sha256: + +build: + number: 0 + script: {{ PYTHON }} -m pip install . 
-vv + noarch: python + entry_points: + - chiltepin = chiltepin.cli:main + +requirements: + host: + - python >=3.10 + - pip + - setuptools >=61.0 + run: + - python >=3.10 + - globus-compute-sdk >=4.5.0,<4.7.0 + - globus-compute-endpoint >=4.5.0,<4.7.0 + - parsl >=2026.1.5 + +test: + imports: + - chiltepin + - chiltepin.cli + - chiltepin.configure + - chiltepin.endpoint + - chiltepin.tasks + - chiltepin.data + commands: + - chiltepin --help + +about: + home: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox + license: Apache-2.0 + license_file: LICENSE + summary: Federated NWP Workflow Tools + description: | + Chiltepin is a Python library for exploring federated workflow capabilities + using Parsl and Globus Compute. It provides tools and demonstrations for + implementing distributed scientific workflows on HPC systems. + doc_url: https://exascaleworkflowsandbox.readthedocs.io + dev_url: https://github.com/NOAA-GSL/ExascaleWorkflowSandbox + +extra: + recipe-maintainers: + # Add your GitHub username(s) here + - christopherwharrop-noaa diff --git a/src/chiltepin/__init__.py b/src/chiltepin/__init__.py index e69de29b..98813136 100644 --- a/src/chiltepin/__init__.py +++ b/src/chiltepin/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/src/chiltepin/cli.py b/src/chiltepin/cli.py index d8a4a45b..52317c2a 100644 --- a/src/chiltepin/cli.py +++ b/src/chiltepin/cli.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import chiltepin.endpoint as endpoint diff --git a/src/chiltepin/configure.py b/src/chiltepin/configure.py index 4bcb9c00..8fcc09d3 100644 --- a/src/chiltepin/configure.py +++ b/src/chiltepin/configure.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from pathlib import Path from typing import Any, Dict, List, Optional diff --git a/src/chiltepin/data.py b/src/chiltepin/data.py index 7e35db7c..a853f12e 100644 --- a/src/chiltepin/data.py +++ b/src/chiltepin/data.py @@ -1,4 +1,60 @@ -from 
concurrent.futures import Future +# SPDX-License-Identifier: Apache-2.0 + +"""Data transfer and management for Chiltepin workflows. + +This module provides specialized tasks for transferring and deleting data between +Globus data transfer endpoints. These tasks integrate seamlessly with Parsl workflows. + +Available Tasks +--------------- +- :func:`transfer_task`: Transfer files/directories between Globus data endpoints +- :func:`delete_task`: Delete files/directories from Globus data endpoints + +Available Functions +------------------- +- :func:`transfer`: Synchronous data transfer using Globus +- :func:`delete`: Synchronous data deletion using Globus + +For comprehensive usage examples and best practices, see the :doc:`data` documentation. + +Examples +-------- +Stage, process, and cleanup data in a workflow:: + + from chiltepin.data import transfer_task, delete_task + from chiltepin.tasks import python_task + + @python_task + def process_data(input_path): + # Process the data + with open(input_path, 'r') as f: + data = f.read() + return len(data) + + # Stage data to compute resource + stage = transfer_task( + src_ep="my-laptop", + dst_ep="hpc-scratch", + src_path="/data/input.dat", + dst_path="/scratch/input.dat", + executor="local" + ) + + # Process the staged data (waits for stage via inputs parameter) + result = process_data("/scratch/input.dat", executor="compute", inputs=[stage]) + + # Clean up after processing completes (waits for result via inputs) + cleanup = delete_task( + src_ep="hpc-scratch", + src_path="/scratch/input.dat", + executor="local", + inputs=[result] + ) + + # Get final result + output = result.result() +""" + from typing import Optional from globus_sdk import TransferClient @@ -17,7 +73,6 @@ def transfer_task( polling_interval: int = 30, client: Optional[TransferClient] = None, recursive: bool = False, - dependencies: Optional[Future] = None, ): """Transfer data asynchronously in a Parsl task @@ -53,16 +108,11 @@ def transfer_task( 
client: TransferClient | None Transfer client to use for submitting the transfers. If None, one - will be retreived via the login process. If a login has already been + will be retrieved via the login process. If a login has already been performed, no login flow prompts will be issued. recursive: bool Whether or not a recursive transfer should be performed - - dependencies: Future | None - Future to wait for completion before running the transfer task. If - None, the transfer will be run at the next available scheduling - opportunity """ # Run the transfer (executes in remote Parsl worker) completed = transfer( # pragma: no cover @@ -86,7 +136,6 @@ def delete_task( polling_interval: int = 30, client: Optional[TransferClient] = None, recursive: bool = False, - dependencies: Optional[Future] = None, ): """Delete data asynchronously in a Parsl task @@ -114,17 +163,12 @@ def delete_task( client: TransferClient | None Transfer client to use for submitting the deletion. If None, one - will be retreived via the login process. If a login has already been + will be retrieved via the login process. If a login has already been performed, no login flow prompts will be issued. NOTE: Yes, deletion is done using a TransferClient. recursive: bool Whether or not a recursive deletion should be performed - - dependencies: Future | None - Future to wait for completion before running the deletion task. If - None, the deletion will be run at the next available scheduling - opportunity """ # Run the deletion (executes in remote Parsl worker) completed = delete( # pragma: no cover @@ -181,7 +225,7 @@ def transfer( client: TransferClient | None Transfer client to use for submitting the transfers. If None, one - will be retreived via the login process. If a login has already been + will be retrieved via the login process. If a login has already been performed, no login flow prompts will be issued. 
recursive: bool @@ -276,7 +320,7 @@ def delete( client: TransferClient | None Transfer client to use for submitting the deletion. If None, one - will be retreived via the login process. If a login has already been + will be retrieved via the login process. If a login has already been performed, no login flow prompts will be issued. NOTE: Yes, deletion is done using a TransferClient. diff --git a/src/chiltepin/endpoint.py b/src/chiltepin/endpoint.py index 2c46611b..3a8f6b86 100644 --- a/src/chiltepin/endpoint.py +++ b/src/chiltepin/endpoint.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pathlib import platform diff --git a/src/chiltepin/tasks.py b/src/chiltepin/tasks.py index 201b1f4e..af4f9593 100644 --- a/src/chiltepin/tasks.py +++ b/src/chiltepin/tasks.py @@ -1,3 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Task decorators for Chiltepin workflows. + +This module provides decorators for defining workflow tasks that can be executed +on configured resources. Tasks are the fundamental units of work in Chiltepin workflows. + +Available Decorators +-------------------- +- :func:`python_task`: Execute Python functions as workflow tasks +- :func:`bash_task`: Execute shell commands as workflow tasks +- :func:`join_task`: Coordinate multiple tasks without blocking workflow execution + +For comprehensive usage examples and best practices, see the :doc:`tasks` documentation. 
+ +Examples +-------- +Define a simple Python task:: + + from chiltepin.tasks import python_task + + @python_task + def add_numbers(a, b): + return a + b + + # Execute on a specific resource + result = add_numbers(5, 3, executor="compute").result() + +Define a bash task:: + + from chiltepin.tasks import bash_task + + @bash_task + def list_files(directory): + return f"ls -la {directory}" + + # Returns exit code (0 = success) + exit_code = list_files("/tmp", executor="compute").result() +""" + from functools import wraps from inspect import Parameter, signature from typing import Callable diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..98813136 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/configs/docker.yaml b/tests/configs/docker.yaml index 1a8172e9..b6cede25 100644 --- a/tests/configs/docker.yaml +++ b/tests/configs/docker.yaml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + mpi: mpi: True max_mpi_apps: 2 diff --git a/tests/configs/hera.yaml b/tests/configs/hera.yaml index d341e7e6..6f8a0ebe 100644 --- a/tests/configs/hera.yaml +++ b/tests/configs/hera.yaml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + mpi: mpi: True max_mpi_apps: 2 diff --git a/tests/configs/hercules.yaml b/tests/configs/hercules.yaml index 19d679d2..82bcc010 100644 --- a/tests/configs/hercules.yaml +++ b/tests/configs/hercules.yaml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + mpi: mpi: True max_mpi_apps: 2 diff --git a/tests/configs/ursa.yaml b/tests/configs/ursa.yaml index 3235e8a3..27f00a26 100644 --- a/tests/configs/ursa.yaml +++ b/tests/configs/ursa.yaml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + mpi: mpi: True max_mpi_apps: 2 diff --git a/tests/conftest.py b/tests/conftest.py index c0713d2a..18e6afec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git 
a/tests/delete-endpoint.py b/tests/delete-endpoint.py index d5244fdd..87a82b2f 100644 --- a/tests/delete-endpoint.py +++ b/tests/delete-endpoint.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import globus_sdk.services.compute import chiltepin.endpoint as endpoint diff --git a/tests/mpi_hello.f90 b/tests/mpi_hello.f90 index e9a0ef07..990d40e4 100644 --- a/tests/mpi_hello.f90 +++ b/tests/mpi_hello.f90 @@ -1,3 +1,5 @@ +! SPDX-License-Identifier: Apache-2.0 + program mpi_hello use iso_fortran_env, only : output_unit diff --git a/tests/mpi_pi.f90 b/tests/mpi_pi.f90 index d8e0b6ea..9dc5134b 100644 --- a/tests/mpi_pi.f90 +++ b/tests/mpi_pi.f90 @@ -1,3 +1,5 @@ +! SPDX-License-Identifier: Apache-2.0 + program calculatePi use mpi diff --git a/tests/test_cli.py b/tests/test_cli.py index e15e2b8c..8519ae81 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + """Tests for the chiltepin CLI. IMPORTANT: This test suite uses mocking to prevent ANY actual endpoint operations diff --git a/tests/test_configure.py b/tests/test_configure.py index fdf67d6a..4c5bd668 100644 --- a/tests/test_configure.py +++ b/tests/test_configure.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + """Tests for chiltepin.configure module. 
This test suite validates that YAML configurations produce the expected diff --git a/tests/test_data.py b/tests/test_data.py index d5e8d6c1..5aacd1b7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import pathlib import uuid @@ -54,7 +56,9 @@ def config(config_file): logger_handler() -def test_data_transfer_task(config): +def test_data_task_basic(config): + """Test basic transfer_task and delete_task functionality.""" + # Transfer a file transfer_future = data.transfer_task( "chiltepin-test-mercury", "chiltepin-test-ursa", @@ -65,12 +69,10 @@ def test_data_transfer_task(config): executor=["local"], client=config["client"], ) - completed = transfer_future.result() - - assert completed is True - + transfer_completed = transfer_future.result() + assert transfer_completed is True -def test_data_delete_task(config): + # Delete the transferred file delete_future = data.delete_task( "chiltepin-test-ursa", config["unique_dst"], @@ -79,13 +81,14 @@ def test_data_delete_task(config): executor=["local"], client=config["client"], ) - completed = delete_future.result() + delete_completed = delete_future.result() + assert delete_completed is True - assert completed is True - -def test_data_transfer(config): - completed = data.transfer( +def test_data_sync_basic(config): + """Test basic synchronous transfer and delete functionality.""" + # Transfer a file (synchronous) + transfer_completed = data.transfer( "chiltepin-test-mercury", "chiltepin-test-ursa", "1MB.from_mercury", @@ -93,17 +96,16 @@ def test_data_transfer(config): timeout=120, polling_interval=10, ) - assert completed is True - + assert transfer_completed is True -def test_data_delete(config): - completed = data.delete( + # Delete the transferred file (synchronous) + delete_completed = data.delete( "chiltepin-test-ursa", config["unique_dst"], timeout=120, polling_interval=10, ) - assert completed is True + assert delete_completed is True 
def test_data_transfer_with_bad_src_ep(config): diff --git a/tests/test_endpoint.py b/tests/test_endpoint.py index 77e72a18..c9203096 100644 --- a/tests/test_endpoint.py +++ b/tests/test_endpoint.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pathlib import shutil diff --git a/tests/test_globus_compute_hello.py b/tests/test_globus_compute_hello.py index fba31049..cc1aed4f 100644 --- a/tests/test_globus_compute_hello.py +++ b/tests/test_globus_compute_hello.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os.path import pathlib diff --git a/tests/test_globus_compute_mpi.py b/tests/test_globus_compute_mpi.py index 9ce496f5..3f8033e7 100644 --- a/tests/test_globus_compute_mpi.py +++ b/tests/test_globus_compute_mpi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os.path import pathlib diff --git a/tests/test_parsl_hello.py b/tests/test_parsl_hello.py index f6aafcab..1578db93 100755 --- a/tests/test_parsl_hello.py +++ b/tests/test_parsl_hello.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os.path import pathlib diff --git a/tests/test_parsl_mpi.py b/tests/test_parsl_mpi.py index 145360a6..486fa4c9 100644 --- a/tests/test_parsl_mpi.py +++ b/tests/test_parsl_mpi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os import os.path diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 36754ecc..66a210c7 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + """Tests for chiltepin.tasks module. 
This test suite validates that task decorators work correctly for both @@ -38,8 +40,8 @@ def parsl_config(): local_config = { "test-local": { "provider": "localhost", - "cores_per_node": 1, - "max_workers_per_node": 1, + "cores_per_node": 2, + "max_workers_per_node": 2, "environment": [f"export PYTHONPATH=${{PYTHONPATH}}:{project_root}"], } } @@ -147,6 +149,48 @@ def get_list(n): result = future.result() assert result == [0, 1, 2, 3, 4] + def test_python_task_with_inputs_dependency(self, parsl_config): + """Test that python_task automatically supports inputs parameter for dependencies.""" + import time + + @python_task + def first_task(): + import time + + # Sleep to ensure we can detect if second starts early + time.sleep(2) + return "first" + + @python_task + def second_task(): + return "second" + + # Time everything in pytest process using monotonic clock + start = time.monotonic() + + # Create first task + first = first_task(executor=["test-local"]) + + # Second task depends on first via inputs parameter (added by Parsl decorator) + # With 2 workers, second COULD start immediately if dependency wasn't enforced + second = second_task(executor=["test-local"], inputs=[first]) + + # Wait for second to complete (which should wait for first due to inputs=[first]) + # Calling second.result() first ensures we're measuring the actual wait time + second_result = second.result() + elapsed = time.monotonic() - start + + # Verify dependency was enforced by checking elapsed time + # If inputs=[first] works, second can't complete until first finishes its 2s sleep + assert elapsed >= 2.0, ( + f"Second task completed in {elapsed:.1f}s, suggesting it didn't wait for first task (expected β‰₯2s)" + ) + + # Verify first also completed successfully + first_result = first.result() + assert first_result == "first" + assert second_result == "second" + def test_python_task_with_dict_return(self, parsl_config): """Test python_task returning a dictionary."""