diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0116f9ac5..0ba3cee61 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,8 +4,8 @@ # Enforces that a member of the @awslabs/sagemaker-hyperpod-dev team for HyperPod lifecycle scripts # They must approve any PRs that modify files under either base-config directory, # including all nested subdirectories and files. -/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev -/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev +/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev +/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev # The blog PR gate runs with secrets via pull_request_target. Only blog # maintainers may modify it — this prevents a malicious edit from turning diff --git a/.github/ISSUE_TEMPLATE/200-bug-report.yml b/.github/ISSUE_TEMPLATE/200-bug-report.yml index ccdcec151..54c742654 100644 --- a/.github/ISSUE_TEMPLATE/200-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/200-bug-report.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: > - #### Before submitting a bug report, please make sure you have searched [existing issues](https://github.com/awslabs/awsome-distributed-training/issues). + #### Before submitting a bug report, please make sure you have searched [existing issues](https://github.com/awslabs/awsome-distributed-ai/issues). **IMPORTANT:** Please redact any access keys, secret keys, session tokens, diff --git a/.github/ISSUE_TEMPLATE/300-ci-failure.yml b/.github/ISSUE_TEMPLATE/300-ci-failure.yml index 238d74bbe..56cd8a2b8 100644 --- a/.github/ISSUE_TEMPLATE/300-ci-failure.yml +++ b/.github/ISSUE_TEMPLATE/300-ci-failure.yml @@ -24,7 +24,7 @@ body: attributes: label: GitHub Actions Run URL description: Link to the failing GitHub Actions run. 
- placeholder: https://github.com/awslabs/awsome-distributed-training/actions/runs/... + placeholder: https://github.com/awslabs/awsome-distributed-ai/actions/runs/... validations: required: false - type: dropdown diff --git a/.github/ISSUE_TEMPLATE/400-feature-request.yml b/.github/ISSUE_TEMPLATE/400-feature-request.yml index ff836d0d0..f27bc2db3 100644 --- a/.github/ISSUE_TEMPLATE/400-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/400-feature-request.yml @@ -8,7 +8,7 @@ body: value: > Thank you for suggesting a feature! For major changes that affect the project's architecture or direction, please consider using the - [RFC template](https://github.com/awslabs/awsome-distributed-training/issues/new?template=600-RFC.yml) instead. + [RFC template](https://github.com/awslabs/awsome-distributed-ai/issues/new?template=600-RFC.yml) instead. - type: textarea id: description attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 0f77d212b..2dc5a72fd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -31,7 +31,7 @@ ``` -3.test_cases/ +examples/ └── / # e.g. pytorch, megatron, jax └── / # e.g. picotron, FSDP, megatron-lm └── / # e.g. SmolLM-1.7B (may be omitted for single-model cases) @@ -48,7 +48,7 @@ ## Checklist -- [ ] I have read the [contributing guidelines](https://github.com/awslabs/awsome-distributed-training/blob/main/CONTRIBUTING.md). +- [ ] I have read the [contributing guidelines](https://github.com/awslabs/awsome-distributed-ai/blob/main/CONTRIBUTING.md). - [ ] I am working against the latest `main` branch. - [ ] I have searched existing open and recently merged PRs to confirm this is not a duplicate. - [ ] The contribution is self-contained with documentation and scripts. 
diff --git a/.github/workflows/fsdp-eks-regression.yml b/.github/workflows/fsdp-eks-regression.yml index e5629c649..c74363940 100644 --- a/.github/workflows/fsdp-eks-regression.yml +++ b/.github/workflows/fsdp-eks-regression.yml @@ -3,7 +3,7 @@ name: FSDP Regression Test (EKS) on: pull_request: paths: - - '3.test_cases/pytorch/FSDP/**' + - 'examples/training/fsdp/**' workflow_dispatch: @@ -26,7 +26,7 @@ jobs: - name: Set env vars run: | BUILD_ID="${{ github.run_id }}" - FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP" + FSDP_DIR="$(pwd)/${BUILD_ID}/examples/training/fsdp" # Set instance specific variables if [[ "${{ matrix.cluster }}" == "p5-eks" ]]; then diff --git a/.github/workflows/fsdp-regression-test-container.yml b/.github/workflows/fsdp-regression-test-container.yml index 23176ccc8..b44309f26 100644 --- a/.github/workflows/fsdp-regression-test-container.yml +++ b/.github/workflows/fsdp-regression-test-container.yml @@ -6,10 +6,10 @@ on: push: branches: [ "main" ] paths: - - '3.test_cases/pytorch/FSDP/**' + - 'examples/training/fsdp/**' pull_request: paths: - - '3.test_cases/pytorch/FSDP/**' + - 'examples/training/fsdp/**' workflow_dispatch: @@ -95,7 +95,7 @@ jobs: id: build working-directory: source-code run: | - FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/pytorch/FSDP" + FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/examples/training/fsdp" ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh" echo "Building FSDP image on cluster..." 
@@ -208,7 +208,7 @@ EOF env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP" + FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp" SBATCH_FILE="slurm/${{ matrix.model_config }}-training.sbatch" TMP_SBATCH="slurm/regression_test_${{ matrix.model_config }}_${{ matrix.cluster }}.sbatch" diff --git a/.github/workflows/fsdp-regression-test-venv.yml b/.github/workflows/fsdp-regression-test-venv.yml index 25f99a3dc..c33f8da58 100644 --- a/.github/workflows/fsdp-regression-test-venv.yml +++ b/.github/workflows/fsdp-regression-test-venv.yml @@ -6,10 +6,10 @@ on: push: branches: [ "main" ] paths: - - '3.test_cases/pytorch/FSDP/**' + - 'examples/training/fsdp/**' pull_request: paths: - - '3.test_cases/pytorch/FSDP/**' + - 'examples/training/fsdp/**' workflow_dispatch: @@ -107,7 +107,7 @@ EOF - name: Create Virtual Environment on Cluster run: | - FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm" + FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp/slurm" echo "Creating virtual environment on cluster..." 
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \ @@ -123,7 +123,7 @@ EOF env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm" + FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp/slurm" SBATCH_FILE="${{ matrix.model_config }}-training.sbatch" TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch" diff --git a/.github/workflows/megatron-ci-slurm.yaml b/.github/workflows/megatron-ci-slurm.yaml index c341f6841..2f2ea62c6 100644 --- a/.github/workflows/megatron-ci-slurm.yaml +++ b/.github/workflows/megatron-ci-slurm.yaml @@ -6,10 +6,10 @@ on: push: branches: [ "main" ] paths: - - '3.test_cases/megatron/megatron-lm/**' + - 'examples/training/megatron-lm/**' pull_request: paths: - - '3.test_cases/megatron/megatron-lm/**' + - 'examples/training/megatron-lm/**' workflow_dispatch: @@ -95,7 +95,7 @@ jobs: id: build working-directory: source-code run: | - MEGATRON_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/megatron/megatron-lm" + MEGATRON_DIR="${{ env.REMOTE_BUILD_PATH }}/examples/training/megatron-lm" ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/megatron-${{ github.run_id }}-${{ matrix.cluster }}.sqsh" echo "Building Megatron-LM image on cluster..." 
diff --git a/.github/workflows/pr-review-and-slurm-test.yml b/.github/workflows/pr-review-and-slurm-test.yml index e957bb204..eae28834b 100644 --- a/.github/workflows/pr-review-and-slurm-test.yml +++ b/.github/workflows/pr-review-and-slurm-test.yml @@ -10,7 +10,7 @@ env: AWS_REGION: us-east-1 SLURM_HOST: p5en.smml.aiml.aws.dev SLURM_USER: ghactions - RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-training + RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-ai AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole permissions: diff --git a/.gitignore b/.gitignore index 51347b993..52fd330cd 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,7 @@ downloads/ eggs/ .eggs/ lib/ -!4.validation_and_observability/2.gpu-cluster-healthcheck/lib/ +!validation_and_observability/gpu-cluster-healthcheck/lib/ lib64/ parts/ sdist/ diff --git a/README.md b/README.md index e7a94d4af..656de23d2 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ This repository contains reference architectures and test cases for distributed The major components of this directory are: ``` -├── 1.architectures/ # CloudFormation templates for reference architectures -├── 2.ami_and_containers/ # Scripts to create AMIs and container images -├── 3.test_cases/ # Reference test cases and/or benchmark scripts -├── 4.validation_and_observability/# Tools to measure performance or troubleshoot +├── architectures/ # CloudFormation templates for reference architectures +├── ami_and_containers/ # Scripts to create AMIs and container images +├── examples/ # Reference test cases and/or benchmark scripts +├── validation_and_observability/# Tools to measure performance or troubleshoot └── micro-benchmarks/ # Micro-benchmarks (NCCL, NCCOM, NVSHMEM, etc.) ``` -**NOTE**: The architectures are designed to work with the S3 bucket and VPC created using reference templates `1.architectures/0.common/` and `1.architectures/1.vpc_network/`. 
_You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._ +**NOTE**: The architectures are designed to work with the S3 bucket and VPC created using reference templates `architectures/common/` and `architectures/vpc_network/`. _You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._ ## 0. Workshops @@ -28,64 +28,69 @@ You can follow the workshops below to train models on AWS. Each contains example Posts about distributed ML training on AWS are published at . The Hugo source lives on the [`content`](https://github.com/awslabs/awsome-distributed/tree/content) branch. -Blog content is editorially curated by AWS authors. Code samples in this repo (`1.architectures/`, `3.test_cases/`, etc.) accept external contributions as usual — see [CONTRIBUTING.md](./CONTRIBUTING.md). +Blog content is editorially curated by AWS authors. Code samples in this repo (`architectures/`, `examples/`, etc.) accept external contributions as usual — see [CONTRIBUTING.md](./CONTRIBUTING.md). ## 1. Architectures -Architectures are located in `1.architectures` and consist of utilities and service-related architectures. +Architectures are located in `architectures` and consist of utilities and service-related architectures. 
| Name | Category | Usage | | ------------------------------------------------------------------------------ | -------- | ---------------------------------------------------- | -| [`0.common`](./1.architectures/0.common) | Storage | Common resources (S3 bucket, event notifications) | -| [`1.vpc_network`](./1.architectures/1.vpc_network) | Network | Create a VPC with subnets and required resources | -| [`2.aws-parallelcluster`](./1.architectures/2.aws-parallelcluster) | Compute | Cluster templates for GPU & custom silicon training | -| [`3.aws-batch`](./1.architectures/3.aws-batch) | Compute | AWS Batch template for distributed training | -| [`4.amazon-eks`](./1.architectures/4.amazon-eks) | Compute | Manifest files to train with Amazon EKS | -| [`5.sagemaker-hyperpod`](./1.architectures/5.sagemaker-hyperpod) | Compute | SageMaker HyperPod template for distributed training | -| [`6.ldap_server`](./1.architectures/6.ldap_server) | Identity | LDAP server for multi-user cluster access | -| [`7.sagemaker-hyperpod-eks`](./1.architectures/7.sagemaker-hyperpod-eks) | Compute | SageMaker HyperPod with EKS orchestration | -| [`8.accounting-database`](./1.architectures/8.accounting-database) | Tooling | Accounting database for job tracking | +| [`common`](./architectures/common) | Storage | Common resources (S3 bucket, event notifications) | +| [`vpc_network`](./architectures/vpc_network) | Network | Create a VPC with subnets and required resources | +| [`aws-parallelcluster`](./architectures/aws-parallelcluster) | Compute | Cluster templates for GPU & custom silicon training | +| [`aws-batch`](./architectures/aws-batch) | Compute | AWS Batch template for distributed training | +| [`amazon-eks`](./architectures/amazon-eks) | Compute | Manifest files to train with Amazon EKS | +| [`sagemaker-hyperpod-slurm`](./architectures/sagemaker-hyperpod-slurm) | Compute | SageMaker HyperPod template for distributed training | +| [`ldap_server`](./architectures/ldap_server) | Identity | 
LDAP server for multi-user cluster access | +| [`sagemaker-hyperpod-eks`](./architectures/sagemaker-hyperpod-eks) | Compute | SageMaker HyperPod with EKS orchestration | +| [`accounting-database`](./architectures/accounting-database) | Tooling | Accounting database for job tracking | | [`aws-pcs`](./architectures/aws-pcs) | Compute | AWS Parallel Computing Service templates with Slurm scheduler | -You will also find [documentation](./1.architectures/efa-cheatsheet.md) for EFA and the recommended environment variables. +You will also find [documentation](./architectures/efa-cheatsheet.md) for EFA and the recommended environment variables. ## 2. Custom Amazon Machine Images Custom machine images can be built using [Packer](https://www.packer.io) for AWS ParallelCluster, Amazon EKS and plain EC2. These images are based on Ansible roles and playbooks. -## 3. Test Cases +## 3. Examples -Test cases are organized under `3.test_cases/` by framework (e.g. `pytorch/`, `megatron/`, `jax/`). Within each framework, directories are named after the training library or method (e.g. `picotron/`, `FSDP/`, `megatron-lm/`). +Examples live under `examples/` and are organized along two axes: -Each test case follows this general structure: +- **`examples/training/`** and **`examples/inference/`** — *framework-centric*. The training or inference engine is the subject, and model variants underneath illustrate it (e.g. `training/fsdp/`, `training/megatron-lm/`, `training/nemo/`). Swapping the model gives "the same example with a different model." +- **`examples/use-cases/`** — *use-case-centric*. A specific model or task is the subject and the framework is incidental (e.g. `use-cases/detr-finetune/`, `use-cases/vjepa2/`). Swapping the framework would still leave a recognizable demo. + +Each example follows this general structure: ``` -3.test_cases/ -└── / # e.g. pytorch, megatron, jax - └── / # e.g. picotron, FSDP, megatron-lm - └── / # e.g. 
SmolLM-1.7B (may be omitted for single-model cases) - ├── Dockerfile # Container / environment setup - ├── README.md - ├── slurm/ # Slurm-specific launch scripts - ├── kubernetes/ # Kubernetes manifests - └── hyperpod-eks/ # HyperPod EKS instructions +examples/ +├── training/ # framework-centric training/fine-tuning engines +│ └── / # e.g. fsdp, deepspeed, megatron-lm, nemo, trl +│ └── / # e.g. llama3 (may be omitted for single-model cases) +│ ├── Dockerfile # Container / environment setup +│ ├── README.md +│ ├── slurm/ # Slurm-specific launch scripts +│ └── kubernetes/ # Kubernetes manifests +├── inference/ # framework-centric inference engines (vllm, …) +└── use-cases/ # use-case-centric end-to-end demos + └── / # e.g. detr-finetune, esm2-hyperpod ``` -The top-level directory for each test case contains general introduction and environment setup (Dockerfiles, training scripts, configs), while subdirectories provide service-specific launch instructions. +The top-level directory for each example contains general introduction and environment setup (Dockerfiles, training scripts, configs), while subdirectories provide service-specific launch instructions. -Browse [`3.test_cases/`](./3.test_cases) to see the full list of available frameworks and test cases. +Browse [`examples/`](./examples) to see the full list of frameworks, engines, and use cases. ## 4. Validation and Observability -Utility scripts and tools for validating your environment and monitoring training jobs are under `4.validation_and_observability/`. +Utility scripts and tools for validating your environment and monitoring training jobs are under `validation_and_observability/`. 
| Name | Comments | | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | -| [`1.pytorch-env-validation`](./4.validation_and_observability/1.pytorch-env-validation) | Validates your PyTorch environment | -| [`2.gpu-cluster-healthcheck`](./4.validation_and_observability/2.gpu-cluster-healthcheck) | GPU cluster health checks | -| [`3.efa-node-exporter`](./4.validation_and_observability/3.efa-node-exporter) | Node exporter with Amazon EFA monitoring modules | -| [`4.prometheus-grafana`](./4.validation_and_observability/4.prometheus-grafana) | Monitoring for SageMaker HyperPod and EKS GPU clusters | -| [`5.nsight`](./4.validation_and_observability/5.nsight) | Shows how to run Nvidia Nsight Systems to profile your workload | +| [`pytorch-env-validation`](./validation_and_observability/pytorch-env-validation) | Validates your PyTorch environment | +| [`gpu-cluster-healthcheck`](./validation_and_observability/gpu-cluster-healthcheck) | GPU cluster health checks | +| [`efa-node-exporter`](./validation_and_observability/efa-node-exporter) | Node exporter with Amazon EFA monitoring modules | +| [`prometheus-grafana`](./validation_and_observability/prometheus-grafana) | Monitoring for SageMaker HyperPod and EKS GPU clusters | +| [`nsight`](./validation_and_observability/nsight) | Shows how to run Nvidia Nsight Systems to profile your workload | ## 5. 
Micro-benchmarks diff --git a/2.ami_and_containers/1.amazon_machine_image/Makefile b/ami_and_containers/amazon_machine_image/Makefile similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/Makefile rename to ami_and_containers/amazon_machine_image/Makefile diff --git a/2.ami_and_containers/1.amazon_machine_image/README.md b/ami_and_containers/amazon_machine_image/README.md similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/README.md rename to ami_and_containers/amazon_machine_image/README.md diff --git a/2.ami_and_containers/1.amazon_machine_image/inventory/group_vars/all.yml b/ami_and_containers/amazon_machine_image/inventory/group_vars/all.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/inventory/group_vars/all.yml rename to ami_and_containers/amazon_machine_image/inventory/group_vars/all.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/inventory/hosts b/ami_and_containers/amazon_machine_image/inventory/hosts similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/inventory/hosts rename to ami_and_containers/amazon_machine_image/inventory/hosts diff --git a/2.ami_and_containers/1.amazon_machine_image/packer-ami.pkr.hcl b/ami_and_containers/amazon_machine_image/packer-ami.pkr.hcl similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/packer-ami.pkr.hcl rename to ami_and_containers/amazon_machine_image/packer-ami.pkr.hcl diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-dlami-gpu.yml b/ami_and_containers/amazon_machine_image/playbook-dlami-gpu.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/playbook-dlami-gpu.yml rename to ami_and_containers/amazon_machine_image/playbook-dlami-gpu.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-dlami-neuron.yml b/ami_and_containers/amazon_machine_image/playbook-dlami-neuron.yml similarity index 100% rename from 
2.ami_and_containers/1.amazon_machine_image/playbook-dlami-neuron.yml rename to ami_and_containers/amazon_machine_image/playbook-dlami-neuron.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-eks-gpu.yml b/ami_and_containers/amazon_machine_image/playbook-eks-gpu.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/playbook-eks-gpu.yml rename to ami_and_containers/amazon_machine_image/playbook-eks-gpu.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-cpu.yml b/ami_and_containers/amazon_machine_image/playbook-pcluster-cpu.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-cpu.yml rename to ami_and_containers/amazon_machine_image/playbook-pcluster-cpu.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-gpu.yml b/ami_and_containers/amazon_machine_image/playbook-pcluster-gpu.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-gpu.yml rename to ami_and_containers/amazon_machine_image/playbook-pcluster-gpu.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-neuron.yml b/ami_and_containers/amazon_machine_image/playbook-pcluster-neuron.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/playbook-pcluster-neuron.yml rename to ami_and_containers/amazon_machine_image/playbook-pcluster-neuron.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml b/ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml rename to ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/converge.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml 
b/ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml rename to ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/molecule.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml b/ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml rename to ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/prepare.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml b/ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml rename to ami_and_containers/amazon_machine_image/roles/aws_cliv2/molecule/default/verify.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_cliv2/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_cliv2/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_cliv2/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_efa/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_efa/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_efa/tasks/main.yml similarity index 
100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_efa/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_efa/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_efa_ofi/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_lustre/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_lustre/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/aws_lustre/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/aws_lustre/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/aws_lustre/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/base/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/base/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/base/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/base/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/base/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/base/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/base/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/base/tasks/main.yml diff --git 
a/2.ami_and_containers/1.amazon_machine_image/roles/docker/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/docker/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/docker/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/docker/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/docker/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/docker/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/docker/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/docker/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/neuron_driver/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/neuron_driver/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/neuron_driver/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/neuron_driver/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/neuron_driver/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_cuda/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_cuda/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_cuda/tasks/main.yml similarity index 100% rename from 
2.ami_and_containers/1.amazon_machine_image/roles/nvidia_cuda/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_cuda/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_docker/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_docker/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_docker/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_docker/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_docker/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_driver/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_driver/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/files/nvidia-persistenced-override.service b/ami_and_containers/amazon_machine_image/roles/nvidia_driver/files/nvidia-persistenced-override.service similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/files/nvidia-persistenced-override.service rename to ami_and_containers/amazon_machine_image/roles/nvidia_driver/files/nvidia-persistenced-override.service diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_driver/tasks/main.yml similarity index 100% rename from 
2.ami_and_containers/1.amazon_machine_image/roles/nvidia_driver/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_driver/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf b/ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf rename to ami_and_containers/amazon_machine_image/roles/nvidia_enroot_pyxis/templates/enroot.conf diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_gdrcopy/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml similarity index 100% rename from 
2.ami_and_containers/1.amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_gdrcopy/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_nccl/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_nccl/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/nvidia_nccl/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/nvidia_nccl/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/nvidia_nccl/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/observability/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/observability/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/observability/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/observability/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/observability/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/observability/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/observability/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/observability/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/packages/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/packages/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/packages/defaults/main.yml rename to 
ami_and_containers/amazon_machine_image/roles/packages/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/packages/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/packages/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/packages/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/packages/tasks/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/defaults/main.yml b/ami_and_containers/amazon_machine_image/roles/pytorch_neuron/defaults/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/defaults/main.yml rename to ami_and_containers/amazon_machine_image/roles/pytorch_neuron/defaults/main.yml diff --git a/2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/tasks/main.yml b/ami_and_containers/amazon_machine_image/roles/pytorch_neuron/tasks/main.yml similarity index 100% rename from 2.ami_and_containers/1.amazon_machine_image/roles/pytorch_neuron/tasks/main.yml rename to ami_and_containers/amazon_machine_image/roles/pytorch_neuron/tasks/main.yml diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile similarity index 99% rename from 2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile rename to ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index dbfec4b1f..210eef613 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -29,7 +29,7 @@ ENV AWS_OFI_NCCL_VERSION=1.12.1-aws ENV NCCL_TESTS_VERSION=master ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and -# nccl>=2.19.0. 
See https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/efa-cheatsheet.md +# nccl>=2.19.0. See https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/efa-cheatsheet.md #ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0 RUN apt-get update -y diff --git a/2.ami_and_containers/containers/pytorch/README.md b/ami_and_containers/containers/pytorch/README.md similarity index 100% rename from 2.ami_and_containers/containers/pytorch/README.md rename to ami_and_containers/containers/pytorch/README.md diff --git a/2.ami_and_containers/3.pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml b/ami_and_containers/pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml similarity index 100% rename from 2.ami_and_containers/3.pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml rename to ami_and_containers/pcluster_create_dlami/01.dlami-ub2004-base-gpu.yaml diff --git a/2.ami_and_containers/3.pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml b/ami_and_containers/pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml similarity index 100% rename from 2.ami_and_containers/3.pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml rename to ami_and_containers/pcluster_create_dlami/02.dlami-ub2004-pytorch-gpu.yaml diff --git a/2.ami_and_containers/3.pcluster_create_dlami/README.md b/ami_and_containers/pcluster_create_dlami/README.md similarity index 100% rename from 2.ami_and_containers/3.pcluster_create_dlami/README.md rename to ami_and_containers/pcluster_create_dlami/README.md diff --git a/2.ami_and_containers/tools/ec2md/README.md b/ami_and_containers/tools/ec2md/README.md similarity index 100% rename from 2.ami_and_containers/tools/ec2md/README.md rename to ami_and_containers/tools/ec2md/README.md diff --git a/2.ami_and_containers/tools/ec2md/ec2md.sh b/ami_and_containers/tools/ec2md/ec2md.sh similarity index 100% rename from 2.ami_and_containers/tools/ec2md/ec2md.sh rename to ami_and_containers/tools/ec2md/ec2md.sh diff --git 
a/1.architectures/8.accounting-database/README.md b/architectures/accounting-database/README.md similarity index 100% rename from 1.architectures/8.accounting-database/README.md rename to architectures/accounting-database/README.md diff --git a/1.architectures/8.accounting-database/cf_database-accounting.yaml b/architectures/accounting-database/cf_database-accounting.yaml similarity index 100% rename from 1.architectures/8.accounting-database/cf_database-accounting.yaml rename to architectures/accounting-database/cf_database-accounting.yaml diff --git a/1.architectures/4.amazon-eks/README.md b/architectures/amazon-eks/README.md similarity index 97% rename from 1.architectures/4.amazon-eks/README.md rename to architectures/amazon-eks/README.md index d649f1d75..c00ed46a3 100644 --- a/1.architectures/4.amazon-eks/README.md +++ b/architectures/amazon-eks/README.md @@ -1,7 +1,7 @@ # Amazon EKS distributed training architecture -This project provides several reference architectures to run distributed training on Amazon EKS for different use cases using `p4d.24xlarge` instances (you can replace them by `p5` or `trn1`. These examples use [eksctl](eksctl.io) and a cluster manifest to create your specified Amazon EKS cluster. +This project provides several reference architectures to run distributed training on Amazon EKS for different use cases using `p4d.24xlarge` instances (you can replace them by `p5` or `trn1`. These examples use [eksctl](https://eksctl.io) and a cluster manifest to create your specified Amazon EKS cluster. ## 0. Prerequisites @@ -15,7 +15,7 @@ To deploy the architectures you must install the dependencies below. You are adv The following digram shows a common architecture that can be used for distributed model training on EKS. - + The EKS cluster has two nodegroups. A `system` nodegroup is used to run pods like kube-dns, kubeflow training operator, etc. which provide internal cluster-scope services and can run on CPU. 
A worker nodegroup built with an accelerated instance type is used to run the distributed training workload. diff --git a/1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml b/architectures/amazon-eks/eks-g4dn-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml rename to architectures/amazon-eks/eks-g4dn-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-g4dn.yaml b/architectures/amazon-eks/eks-g4dn.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn.yaml rename to architectures/amazon-eks/eks-g4dn.yaml diff --git a/1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml b/architectures/amazon-eks/eks-g5-node-autorepair.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml rename to architectures/amazon-eks/eks-g5-node-autorepair.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml b/architectures/amazon-eks/eks-p4de-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml rename to architectures/amazon-eks/eks-p4de-odcr-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr.yaml b/architectures/amazon-eks/eks-p4de-odcr.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr.yaml rename to architectures/amazon-eks/eks-p4de-odcr.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml b/architectures/amazon-eks/eks-p5-capacity-block.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml rename to architectures/amazon-eks/eks-p5-capacity-block.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml b/architectures/amazon-eks/eks-p5-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml rename to architectures/amazon-eks/eks-p5-odcr-vpc.yaml diff --git a/1.architectures/3.aws-batch/0.aws-batch-distributed-training-p5.yaml 
b/architectures/aws-batch/0.aws-batch-distributed-training-p5.yaml similarity index 100% rename from 1.architectures/3.aws-batch/0.aws-batch-distributed-training-p5.yaml rename to architectures/aws-batch/0.aws-batch-distributed-training-p5.yaml diff --git a/1.architectures/3.aws-batch/0.aws-batch-distributed-training.yaml b/architectures/aws-batch/0.aws-batch-distributed-training.yaml similarity index 100% rename from 1.architectures/3.aws-batch/0.aws-batch-distributed-training.yaml rename to architectures/aws-batch/0.aws-batch-distributed-training.yaml diff --git a/1.architectures/3.aws-batch/README.md b/architectures/aws-batch/README.md similarity index 94% rename from 1.architectures/3.aws-batch/README.md rename to architectures/aws-batch/README.md index 78b3ac563..6ea69c608 100644 --- a/1.architectures/3.aws-batch/README.md +++ b/architectures/aws-batch/README.md @@ -14,7 +14,7 @@ This repository provides CloudFormation templates and examples for running distr ## Prerequisites -> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../1.architectures/1.vpc_network/2.vpc-one-az.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. +> **⚠️ Important**: You must deploy the VPC template [`2.vpc-one-az.yaml`](../../architectures/vpc_network/2.vpc-one-az.yaml) before deploying any Batch template. The Batch templates automatically fetch the EFA Security Group ID and Subnet ID from the VPC template's exported values. 
## Architecture Overview @@ -28,7 +28,7 @@ This architecture consists of the following AWS resources: | **Job Definition** | Template for job execution, references container images | [AWS Docs](https://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html) | | **ECR Container Registry** | Stores Docker container images | [AWS Docs](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) | -AWS Batch Architecture Diagram +AWS Batch Architecture Diagram ## Available Templates @@ -44,7 +44,7 @@ This architecture consists of the following AWS resources: Deploy the standard template with one click: -[
1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/0.aws-batch-distributed-training.yaml&stackName=AWS-Batch) +[
1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/0.aws-batch-distributed-training.yaml&stackName=AWS-Batch) ### Parameters diff --git a/1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml b/architectures/aws-batch/aws-batch-distributed-training-p6.yaml similarity index 100% rename from 1.architectures/3.aws-batch/aws-batch-distributed-training-p6.yaml rename to architectures/aws-batch/aws-batch-distributed-training-p6.yaml diff --git a/1.architectures/2.aws-parallelcluster/.gitignore b/architectures/aws-parallelcluster/.gitignore similarity index 100% rename from 1.architectures/2.aws-parallelcluster/.gitignore rename to architectures/aws-parallelcluster/.gitignore diff --git a/1.architectures/2.aws-parallelcluster/README-full-fledged.md b/architectures/aws-parallelcluster/README-full-fledged.md similarity index 100% rename from 1.architectures/2.aws-parallelcluster/README-full-fledged.md rename to architectures/aws-parallelcluster/README-full-fledged.md diff --git a/1.architectures/2.aws-parallelcluster/README.md b/architectures/aws-parallelcluster/README.md similarity index 96% rename from 1.architectures/2.aws-parallelcluster/README.md rename to architectures/aws-parallelcluster/README.md index 02b986a56..76b0f83ec 100644 --- a/1.architectures/2.aws-parallelcluster/README.md +++ b/architectures/aws-parallelcluster/README.md @@ -9,7 +9,7 @@ This README provides a "vanilla" reference architectures and deployment guide fo ## Architecture -![AWS ParallelCluster diagram](../../0.docs/core-infra-architecture.png) +![AWS ParallelCluster diagram](../../assets/core-infra-architecture.png) The infrastructure consists of the two layers: @@ -55,8 +55,8 @@ These tools are essential for following the deployment steps in this guide and m First, clone the repository and move to this directory: ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git 
-cd awsome-distributed-training/1.architectures/2.aws-parallelcluster +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/architectures/aws-parallelcluster ``` Then create a directory under home directory to store cluster config files: @@ -185,7 +185,7 @@ To deploy the S3 bucket using our CloudFormation template: 1. Click the button below to launch the CloudFormation stack: -[
1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-training.s3.amazonaws.com/templates/0.private-bucket.yaml&stackName=cluster-data-bucket) +[
1-Click Deploy 🚀
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-ai.s3.amazonaws.com/templates/0.private-bucket.yaml&stackName=cluster-data-bucket) 2. In the CloudFormation console: - Enter a stack name (e.g., `cluster-data-bucket`) @@ -222,7 +222,7 @@ Please follow the steps below to deploy your resources: 1. Click on this link to deploy to CloudFormation: -[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-training.s3.amazonaws.com/templates/parallelcluster-prerequisites.yaml&stackName=parallelcluster-prerequisites) +[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-ai.s3.amazonaws.com/templates/parallelcluster-prerequisites.yaml&stackName=parallelcluster-prerequisites) > [!IMPORTANT] > When opening the link, you must specify the region and availability zone where your compute resources are located. Be sure to select the correct region and fill out the "Availability Zone configuration for the subnets" field, when you create the stack. @@ -230,9 +230,9 @@ Please follow the steps below to deploy your resources: > [!NOTE] > The above CloudFormation stack uses FSx for Lustre `PERSISTENT_2` deployment type by default. If your selected availability zone doesn't support `PERSISTENT_2` or you specifically need to use `PERSISTENT_1` deployment type, please use the link below instead: -> [
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-training.s3.amazonaws.com/templates/parallelcluster-prerequisites-p1.yaml&stackName=parallelcluster-prerequisites) +> [
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateUrl=https://awsome-distributed-ai.s3.amazonaws.com/templates/parallelcluster-prerequisites-p1.yaml&stackName=parallelcluster-prerequisites) -![parallelcluster-prerequisites-cfn](../../0.docs/parallelcluster-prerequisites-cfn.png) +![parallelcluster-prerequisites-cfn](../../assets/parallelcluster-prerequisites-cfn.png) 2. Once the CloudFormation stack deployment is complete, you'll need to export the stack name as an environment variable for future steps: @@ -453,7 +453,7 @@ Once the cluster goes into **CREATE COMPLETE**, we can connect to the head node **SSH** can be used to connect to the cluster from a standard SSH client. This can be configured to use your own key via adding the public key or a new key can be provisioned. ### SSM Connect -![ssm connect](../../0.docs/ssm-connect.png) +![ssm connect](../../assets/ssm-connect.png) You'll need to be authenticated to the AWS account that instance is running in and have permission to launch a SSM session . 
Once you're connected you'll have access to a terminal on the head node: Now change to `ubuntu` user: @@ -462,7 +462,7 @@ Now change to `ubuntu` user: sudo su - ubuntu ``` -![ssm user connect](../../0.docs/ssm-connect-user.png) +![ssm user connect](../../assets/ssm-connect-user.png) ### SSH access diff --git a/1.architectures/2.aws-parallelcluster/cluster-templates/cluster-vanilla.yaml b/architectures/aws-parallelcluster/cluster-templates/cluster-vanilla.yaml similarity index 100% rename from 1.architectures/2.aws-parallelcluster/cluster-templates/cluster-vanilla.yaml rename to architectures/aws-parallelcluster/cluster-templates/cluster-vanilla.yaml diff --git a/1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml b/architectures/aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml similarity index 100% rename from 1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml rename to architectures/aws-parallelcluster/infra-templates/parallelcluster-prerequisites-p1.yaml diff --git a/1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml b/architectures/aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml similarity index 100% rename from 1.architectures/2.aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml rename to architectures/aws-parallelcluster/infra-templates/parallelcluster-prerequisites.yaml diff --git a/1.architectures/2.aws-parallelcluster/post-install-scripts/install-node-exporter.sh b/architectures/aws-parallelcluster/post-install-scripts/install-node-exporter.sh similarity index 96% rename from 1.architectures/2.aws-parallelcluster/post-install-scripts/install-node-exporter.sh rename to architectures/aws-parallelcluster/post-install-scripts/install-node-exporter.sh index f7d816d87..0fa7305e3 100644 --- a/1.architectures/2.aws-parallelcluster/post-install-scripts/install-node-exporter.sh +++ 
b/architectures/aws-parallelcluster/post-install-scripts/install-node-exporter.sh @@ -14,7 +14,7 @@ # Sequence: # - Script: s3:///install-node-exporter.sh # # Or use GitHub raw URL directly (no S3 upload required): -# # - Script: https://raw.githubusercontent.com/awslabs/awsome-distributed-training/main/1.architectures/2.aws-parallelcluster/post-install-scripts/install-node-exporter.sh +# # - Script: https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/architectures/aws-parallelcluster/post-install-scripts/install-node-exporter.sh # # Environment variables: # NODE_EXPORTER_VERSION: Version to install (default: 1.9.1) diff --git a/1.architectures/2.aws-parallelcluster/tips-and-tricks.md b/architectures/aws-parallelcluster/tips-and-tricks.md similarity index 100% rename from 1.architectures/2.aws-parallelcluster/tips-and-tricks.md rename to architectures/aws-parallelcluster/tips-and-tricks.md diff --git a/1.architectures/2.aws-parallelcluster/troubleshooting-guide.md b/architectures/aws-parallelcluster/troubleshooting-guide.md similarity index 100% rename from 1.architectures/2.aws-parallelcluster/troubleshooting-guide.md rename to architectures/aws-parallelcluster/troubleshooting-guide.md diff --git a/1.architectures/2.aws-parallelcluster/utils/create_config.sh b/architectures/aws-parallelcluster/utils/create_config.sh similarity index 100% rename from 1.architectures/2.aws-parallelcluster/utils/create_config.sh rename to architectures/aws-parallelcluster/utils/create_config.sh diff --git a/1.architectures/2.aws-parallelcluster/utils/easy-ssh.sh b/architectures/aws-parallelcluster/utils/easy-ssh.sh similarity index 100% rename from 1.architectures/2.aws-parallelcluster/utils/easy-ssh.sh rename to architectures/aws-parallelcluster/utils/easy-ssh.sh diff --git a/1.architectures/2.aws-parallelcluster/utils/pcluster-fetch-config.sh b/architectures/aws-parallelcluster/utils/pcluster-fetch-config.sh similarity index 100% rename from 
1.architectures/2.aws-parallelcluster/utils/pcluster-fetch-config.sh rename to architectures/aws-parallelcluster/utils/pcluster-fetch-config.sh diff --git a/architectures/aws-pcs/README.md b/architectures/aws-pcs/README.md index 033206050..b173e3412 100644 --- a/architectures/aws-pcs/README.md +++ b/architectures/aws-pcs/README.md @@ -309,12 +309,12 @@ For more details, see the [Connect to Cluster](https://catalog.workshops.aws/ml- ### LDAP User Management For centralized user management across the cluster, see: -- [LDAP Server Setup Guide](../6.ldap_server/README.md) - Deploy and configure OpenLDAP for cluster-wide user authentication +- [LDAP Server Setup Guide](../ldap_server/README.md) - Deploy and configure OpenLDAP for cluster-wide user authentication ### Observability Stack For monitoring and observability, see: -- [Prometheus & Grafana Setup](../../4.validation_and_observability/4.prometheus-grafana/README.md) - Deploy monitoring stack with DCGM metrics +- [Prometheus & Grafana Setup](../../validation_and_observability/prometheus-grafana/README.md) - Deploy monitoring stack with DCGM metrics - [AWS ParallelCluster Monitoring](https://github.com/aws-samples/aws-parallelcluster-monitoring) - Comprehensive monitoring solution with Prometheus, Grafana, and custom dashboards for HPC clusters --- diff --git a/1.architectures/0.common/0.private-bucket.yaml b/architectures/common/0.private-bucket.yaml similarity index 100% rename from 1.architectures/0.common/0.private-bucket.yaml rename to architectures/common/0.private-bucket.yaml diff --git a/1.architectures/0.common/README.md b/architectures/common/README.md similarity index 89% rename from 1.architectures/0.common/README.md rename to architectures/common/README.md index da2f44ca4..8f523c029 100644 --- a/1.architectures/0.common/README.md +++ b/architectures/common/README.md @@ -4,7 +4,7 @@ This template creates a S3 Bucket with all public access disabled. To deploy it, click the link below: -[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/0.private-bucket.yaml&stackName=ML-S3) +[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/0.private-bucket.yaml&stackName=ML-S3) ## HyperPod cluster status change / node health event notifications diff --git a/1.architectures/0.common/hyperpod-event-bridge-email.yaml b/architectures/common/hyperpod-event-bridge-email.yaml similarity index 100% rename from 1.architectures/0.common/hyperpod-event-bridge-email.yaml rename to architectures/common/hyperpod-event-bridge-email.yaml diff --git a/1.architectures/efa-cheatsheet.md b/architectures/efa-cheatsheet.md similarity index 100% rename from 1.architectures/efa-cheatsheet.md rename to architectures/efa-cheatsheet.md diff --git a/1.architectures/6.ldap_server/README.md b/architectures/ldap_server/README.md similarity index 100% rename from 1.architectures/6.ldap_server/README.md rename to architectures/ldap_server/README.md diff --git a/1.architectures/6.ldap_server/cf_ldap_server.yaml b/architectures/ldap_server/cf_ldap_server.yaml similarity index 100% rename from 1.architectures/6.ldap_server/cf_ldap_server.yaml rename to architectures/ldap_server/cf_ldap_server.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh b/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh rename to architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh b/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh rename to architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/README-manual-steps.md b/architectures/sagemaker-hyperpod-eks/README-manual-steps.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/README-manual-steps.md rename to architectures/sagemaker-hyperpod-eks/README-manual-steps.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/README.md b/architectures/sagemaker-hyperpod-eks/README.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/README.md rename to architectures/sagemaker-hyperpod-eks/README.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/README.md b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/README.md similarity index 83% rename from 1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/README.md rename to architectures/sagemaker-hyperpod-eks/automate-smhp-eks/README.md index e39864965..0a750f3ba 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/README.md +++ b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/README.md @@ -9,7 +9,7 @@ If you plan to use this script to **deploy the same infrastructure multiple time ## What the Helper Script Does -In this section, we provide you with a [helper script](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh) that will walk you through the following: +In this section, we provide you with a [helper script](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh) that will walk you through the following: 1. Installing the right packages in your environment (e.g. aws cli, helm, kubectl, eksctl) - Optionally creating a [SageMaker Studio Code Editor](https://docs.aws.amazon.com/sagemaker/latest/dg/code-editor.html) environment for you to use. 
@@ -18,7 +18,7 @@ In this section, we provide you with a [helper script](https://github.com/awslab - A Private Subnet in the availability zone where your accelerated compute capacity resides. - A Security Group configured for FSx for Lustre and Elastic Fabric Adapter (EFA) communication. - An EKS Cluster to use as the control interface for your HyperPod cluster. - - An S3 Bucket with with the [on_create.sh](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh) lifecycle script auto-uploaded. + - An S3 Bucket with with the [on_create.sh](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh) lifecycle script auto-uploaded. - An IAM Role which allows the HyperPod cluster to run and communicate with other AWS resource on your behalf. 3. Configuring and deploying your HyperPod cluster with the option to add multiple instance groups. 4. Configuring your EKS cluster, including: @@ -35,8 +35,8 @@ The following diagram depicts the high-level workflow of how the helper script d 1. When you run the helper script, you will be prompted to answer a series of questions in order to dynamically configure the cloud resources to fit your needs. 2. The helper script references an AWS managed S3 bucket to pull down CloudFormation stack templates. -3. Optionally, the helper script will use the [`sagemaker-studio-stack.yaml`](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml) file to deploy a [SageMaker Studio Code Editor](https://docs.aws.amazon.com/sagemaker/latest/dg/code-editor.html) environment for you. -4. 
The helper script will then use the [`main-stack.yaml`](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml) file to deploy the remaining components of the workshop infrastructure. This includes a series of nested Cloudformation stacks that will be pulled from the AWS managed S3 bucket. +3. Optionally, the helper script will use the [`sagemaker-studio-stack.yaml`](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml) file to deploy a [SageMaker Studio Code Editor](https://docs.aws.amazon.com/sagemaker/latest/dg/code-editor.html) environment for you. +4. The helper script will then use the [`main-stack.yaml`](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml) file to deploy the remaining components of the workshop infrastructure. This includes a series of nested Cloudformation stacks that will be pulled from the AWS managed S3 bucket. 5. AWS CloudFormation will deploy the configured workshop infrastructure, including VPC, Private Subnet, Security Group, S3 Bucket, IAM Role, and EKS Cluster resources. See [Deploy HyperPod Infrastructure using CloudFormation](./cfn-templates/README.md) for details. 6. Finally the helper script walks you through a series of prompts to configure and deploy a HyperPod cluster using the AWS CLI. @@ -49,13 +49,13 @@ Prerequisites - Bash shell environment - AWS account with appropriate permissions -To run the helper script, clone the [awsome-distributed-training](https://github.com/awslabs/awsome-distributed-training) repository, or directly download the script (as shown below) and run it on your local (Linux/macOS) terminal. It is recommended to run this script from your own directory, rather than running `cd` into the cloned repository directory. 
Make sure you've configured the AWS CLI with the IAM principal (user or role) you wish to use before running the script. +To run the helper script, clone the [awsome-distributed-ai](https://github.com/awslabs/awsome-distributed-ai) repository, or directly download the script (as shown below) and run it on your local (Linux/macOS) terminal. It is recommended to run this script from your own directory, rather than running `cd` into the cloned repository directory. Make sure you've configured the AWS CLI with the IAM principal (user or role) you wish to use before running the script. ```bash # Clone the repository mkdir hyperpod-eks && cd hyperpod-eks -curl -O https://raw.githubusercontent.com/awslabs/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh +curl -O https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/refs/heads/main/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh # Make the script executable chmod +x hyperpod-eks-cluster-creation.sh @@ -65,7 +65,7 @@ chmod +x hyperpod-eks-cluster-creation.sh ``` Executing the script will result in various prompts showing up in your terminal similar to the demonstration below: -![SageMaker Hyperpod Cluster Automation Demo](/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif) +![SageMaker Hyperpod Cluster Automation Demo](/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif) These prompts will ask you various questions about how you want to configure your cloud resources. There are intuitive defaults in place to help expedite resource creation. These defaults are indicated using square brackets `[ ]`. @@ -86,7 +86,7 @@ Voila! Once you get through the script, you should see a directory that looks li ```bash hyperpod/ |-- hyperpod-eks-cluster-creation.sh # This script! 
-|-- awsome-distributed-training/ # The script clones the repo with all the Lifecycle Scripts! +|-- awsome-distributed-ai/ # The script clones the repo with all the Lifecycle Scripts! |-- cluster-config.json # Cluster configuration generated by this script. Run `vi cluster-config.json` to make changes |-- env_vars # Your environment variables used to create the SMHP cluster `-- ... diff --git a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh similarity index 99% rename from 1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh rename to architectures/sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh index 573d866b1..87e9cf0e5 100755 --- a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh +++ b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/automate-eks-cluster-creation.sh @@ -286,9 +286,9 @@ install_helm() { fi } -# Function to clone the awsome-distributed-training repository +# Function to clone the awsome-distributed-ai repository clone_adt() { - REPO_NAME="awsome-distributed-training" + REPO_NAME="awsome-distributed-ai" if [ -d "$REPO_NAME" ]; then echo -e "${YELLOW}⚠️ The directory '$REPO_NAME' already exists.${NC}" echo -e "${GREEN}Do you want to remove it and clone again? 
(yes/no): ${NC}" @@ -297,14 +297,14 @@ clone_adt() { echo -e "${YELLOW}Removing existing directory...${NC}" rm -rf "$REPO_NAME" echo -e "${BLUE}Cloning repository...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" else echo -e "${BLUE}Using existing directory...${NC}" fi else echo -e "${BLUE}Cloning repository $REPO_NAME...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" fi } @@ -464,7 +464,7 @@ check_and_prompt_env_vars() { # Function to setup environment variables setup_env_vars() { echo -e "${BLUE}=== Setting Up Environment Variables ===${NC}" - echo -e "${GREEN}Cloning awsome-distributed-training${NC}" + echo -e "${GREEN}Cloning awsome-distributed-ai${NC}" clone_adt # Clear env_vars from previous runs @@ -484,7 +484,7 @@ setup_env_vars() { echo -e "${YELLOW}Generating new environment variables...${NC}" generate_env_vars() { - ./awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh + ./awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/create_config.sh # bash create_config.sh } @@ -521,7 +521,7 @@ setup_env_vars() { setup_lifecycle_scripts() { echo -e "${BLUE}=== Setting Up Lifecycle Scripts ===${NC}" - cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/ + cd awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/ echo -e "${BLUE}Uploading your lifecycle scripts to S3 bucket ${YELLOW}${BUCKET}${NC}" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh similarity index 99% rename 
from 1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh rename to architectures/sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh index f2723ab60..d39913e2b 100755 --- a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh +++ b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/hyperpod-eks-cluster-creation.sh @@ -352,23 +352,23 @@ get_input() { echo "${input:-$default}" } -# Function to clone the awsome-distributed-training repository +# Function to clone the awsome-distributed-ai repository clone_adt() { - REPO_NAME="awsome-distributed-training" + REPO_NAME="awsome-distributed-ai" if [ -d "$REPO_NAME" ]; then echo -e "${YELLOW}⚠️ The directory '$REPO_NAME' already exists.${NC}" if get_yes_no "Do you want to remove it and clone again?" "n"; then echo -e "${YELLOW}Removing existing directory...${NC}" rm -rf "$REPO_NAME" echo -e "${BLUE}Cloning repository...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" else echo -e "${BLUE}Using existing directory...${NC}" fi else echo -e "${BLUE}Cloning repository $REPO_NAME...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" fi } @@ -402,7 +402,7 @@ unset_env_vars() { # Function to setup environment variables setup_env_vars() { echo -e "${BLUE}=== Setting Up Environment Variables ===${NC}" - echo -e "${GREEN}Cloning awsome-distributed-training${NC}" + echo -e "${GREEN}Cloning awsome-distributed-ai${NC}" clone_adt export STACK_ID=${STACK_NAME:-hyperpod-eks-full-stack} @@ -414,7 +414,7 @@ setup_env_vars() { echo -e "${YELLOW}Generating new environment variables...${NC}" generate_env_vars() { - 
./awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh + ./awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/create_config.sh } # Capture stdout + stderr diff --git a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif rename to architectures/sagemaker-hyperpod-eks/automate-smhp-eks/media/automate-smhp-eks-demo.gif diff --git a/1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png b/architectures/sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png rename to architectures/sagemaker-hyperpod-eks/automate-smhp-eks/media/helper-script.png diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/README.md b/architectures/sagemaker-hyperpod-eks/cfn-templates/README.md similarity index 99% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/README.md rename to architectures/sagemaker-hyperpod-eks/cfn-templates/README.md index e20797414..c2f9dfe8a 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/README.md +++ b/architectures/sagemaker-hyperpod-eks/cfn-templates/README.md @@ -202,7 +202,7 @@ Note: If you opt to disable the S3BucketStack, please use the S3BucketName param
LifeCycleScriptStack -This stack deploys an AWS Lambda function that creates a [default lifecycle script](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh) and stores it in the referenced S3 bucket. +This stack deploys an AWS Lambda function that creates a [default lifecycle script](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh) and stores it in the referenced S3 bucket. diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/.gitignore diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/Dockerfile diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/README.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/build-layer.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/deploy.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/lambda_function.py b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/lambda_function.py similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/lambda_function.py rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/lambda_function.py diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/requirements.txt b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/requirements.txt similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/requirements.txt rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/lambda_function/requirements.txt diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/package-function.sh diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh b/architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh rename to architectures/sagemaker-hyperpod-eks/cfn-templates/helm-chart-injector/run-docker-build.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/hyperpod-eks-full-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stack-modules.png diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/eks-cluster-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml rename to 
architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/helm-chart-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/hyperpod-cluster-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml index 8a7a20dd6..9d24f870e 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml +++ b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/lifecycle-script-stack.yaml @@ -48,7 +48,7 @@ Resources: Environment: Variables: BUCKET_NAME: !Ref S3BucketName - GITHUB_FOLDER_URL: 'https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config' + GITHUB_FOLDER_URL: 'https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config' Handler: index.lambda_handler Role: !GetAtt S3CustomResourceRole.Arn Runtime: python3.12 diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml rename to 
architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-bucket-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/s3-endpoint-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/sagemaker-iam-role-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml 
similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/security-group-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-fsx-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml b/architectures/sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml rename to architectures/sagemaker-hyperpod-eks/cfn-templates/sagemaker-studio-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cluster-topology/README.md b/architectures/sagemaker-hyperpod-eks/cluster-topology/README.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/cluster-topology/README.md rename to architectures/sagemaker-hyperpod-eks/cluster-topology/README.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cluster-topology/visualize_topology.sh b/architectures/sagemaker-hyperpod-eks/cluster-topology/visualize_topology.sh similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/cluster-topology/visualize_topology.sh rename to architectures/sagemaker-hyperpod-eks/cluster-topology/visualize_topology.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh b/architectures/sagemaker-hyperpod-eks/create_config.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/create_config.sh rename to architectures/sagemaker-hyperpod-eks/create_config.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/.gitignore b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/.gitignore similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/.gitignore rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/.gitignore diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/README.md b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/README.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/README.md rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/README.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/COMPATIBILITY-ANALYSIS.md b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/COMPATIBILITY-ANALYSIS.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/COMPATIBILITY-ANALYSIS.md rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/COMPATIBILITY-ANALYSIS.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/DEPLOYMENT-GUIDE.md b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/DEPLOYMENT-GUIDE.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/DEPLOYMENT-GUIDE.md rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/DEPLOYMENT-GUIDE.md diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/TROUBLESHOOTING.md b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/TROUBLESHOOTING.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/TROUBLESHOOTING.md rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/docs/TROUBLESHOOTING.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/00-discover-cluster.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/00-discover-cluster.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/00-discover-cluster.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/00-discover-cluster.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/01-prepare-cluster.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/01-prepare-cluster.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/01-prepare-cluster.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/01-prepare-cluster.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/02-install-gpu-operator.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/02-install-gpu-operator.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/02-install-gpu-operator.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/02-install-gpu-operator.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/03-register-nvca.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/03-register-nvca.sh similarity index 100% 
rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/03-register-nvca.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/03-register-nvca.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/04-validate-setup.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/04-validate-setup.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/04-validate-setup.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/infra/scripts/04-validate-setup.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf-config.env.template b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf-config.env.template similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf-config.env.template rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf-config.env.template diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/network-policy-patch.yaml b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/network-policy-patch.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/network-policy-patch.yaml rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/network-policy-patch.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/Dockerfile b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/Dockerfile similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/Dockerfile rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/Dockerfile diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/deploy.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/deploy.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/deploy.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/deploy.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/requirements.txt b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/requirements.txt similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/requirements.txt rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/requirements.txt diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/server.py b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/server.py similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/server.py rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/nvcf/sample-function/server.py diff --git a/1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/tests/validate-cluster.sh b/architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/tests/validate-cluster.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/nvidia-cloud-functions/tests/validate-cluster.sh rename to architectures/sagemaker-hyperpod-eks/nvidia-cloud-functions/tests/validate-cluster.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.gitignore b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.gitignore similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.gitignore rename to 
architectures/sagemaker-hyperpod-eks/slinky-slurm/.gitignore diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/bash-testing/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/bash-testing/SKILL.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/bash-testing/SKILL.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/bash-testing/SKILL.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/build-slurm-image/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/build-slurm-image/SKILL.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/build-slurm-image/SKILL.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/build-slurm-image/SKILL.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md index 0c702621d..06c1ccf4e 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-infrastructure/SKILL.md @@ -181,7 +181,7 @@ kubectl get nodes -o wide |---------|-------|-----| | Stack creation fails with capacity error | AZ doesn't have capacity for the instance type | Try a different `--az-id` | | `ROLLBACK_COMPLETE` status | CFN template parameter issue | Check CloudFormation events: `aws cloudformation describe-stack-events --stack-name ` | -| Terraform plan fails 
| Missing terraform-modules directory | Ensure the full `awsome-distributed-training` repo is cloned, not just the slinky-slurm subdirectory | +| Terraform plan fails | Missing terraform-modules directory | Ensure the full `awsome-distributed-ai` repo is cloned, not just the slinky-slurm subdirectory | | `env_vars.sh` not created | Script failed before output extraction | Check the script output for errors and re-run | | AZ ID validation warning | Specified AZ not in the region | Use one of the AZ IDs shown in the "Available AZs" message | | Stack already exists | Previous deployment not cleaned up | Run `destroy.sh` first, or use a different `--stack-name` | diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-slurm-cluster/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-slurm-cluster/SKILL.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-slurm-cluster/SKILL.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deploy-slurm-cluster/SKILL.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deployment-preflight/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deployment-preflight/SKILL.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deployment-preflight/SKILL.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/deployment-preflight/SKILL.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/validate-deployment/SKILL.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/validate-deployment/SKILL.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/validate-deployment/SKILL.md rename to 
architectures/sagemaker-hyperpod-eks/slinky-slurm/.opencode/skills/validate-deployment/SKILL.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md index a955c54ae..13c74b4e3 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/AGENTS.md @@ -1,7 +1,7 @@ # AGENTS.md — slinky-slurm > Guidelines for AI coding agents operating in the `slinky-slurm` subdirectory of -> `awsome-distributed-training`. This is an infrastructure-as-code project for deploying +> `awsome-distributed-ai`. This is an infrastructure-as-code project for deploying > Slurm on Amazon SageMaker HyperPod EKS via the Slinky Project (SchedMD). ## Project Overview @@ -74,7 +74,7 @@ The repo root has `.markdownlint.jsonc` with these rules: ```bash # From repo root -npx markdownlint-cli2 "1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/**/*.md" +npx markdownlint-cli2 "architectures/sagemaker-hyperpod-eks/slinky-slurm/**/*.md" ``` ### CI Static Analysis (PR workflow) diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md similarity index 96% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md index 430912ffa..1cf6116c2 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md @@ -8,8 +8,8 @@ This build includes Python 3.12.8 + PyTorch 2.6.0 + CUDA 12.6 + NCCL 2.23.4 + EF Clone the AWSome Distributed Training repo: ``` -git clone 
https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/ +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/slinky-slurm/ ``` diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/README.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/README.md similarity index 96% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/README.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/README.md index 1a8c9bd36..190a90c03 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/README.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/README.md @@ -88,8 +88,8 @@ deploy.sh → install.sh → (run workloads) → destroy.sh #### Clone the Repository ``` -git clone https://github.com/awslabs/awsome-distributed-training.git -cp -r awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm . +git clone https://github.com/awslabs/awsome-distributed-ai.git +cp -r awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/slinky-slurm . 
cd slinky-slurm ``` @@ -331,15 +331,15 @@ apt install -y vim vim --version cd /fsx -git clone https://github.com/awslabs/awsome-distributed-training/ -cd awsome-distributed-training/3.test_cases/pytorch/FSDP/slurm +git clone https://github.com/awslabs/awsome-distributed-ai/ +cd awsome-distributed-ai/examples/training/fsdp/slurm mkdir -p checkpoints ``` --- Copy the modified sbatch file: ``` -export SLINKY_PATH=/fsx/awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm +export SLINKY_PATH=/fsx/awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/slinky-slurm # for g5 instances cp ${SLINKY_PATH}/sbatch/fsdp/g5-llama2_7b-training.sbatch ./llama2_7b-training.sbatch @@ -377,7 +377,7 @@ Watch the error logs from `slurm-worker-slinky-0`: # from a new terminal window kubectl -n slurm exec -it pod/slurm-worker-slinky-0 -- bash --login -cd /fsx/awsome-distributed-training/3.test_cases/pytorch/FSDP/slurm +cd /fsx/awsome-distributed-ai/examples/training/fsdp/slurm export JOB_ID=$(squeue -h -u root -o "%i" | head -1) watch "grep 'Batch.*Loss' logs/llama2_7b-FSDP_${JOB_ID}.err" @@ -403,7 +403,7 @@ Watch checkpoints from `slurm-worker-slinky-2`: # from a new terminal window kubectl -n slurm exec -it pod/slurm-worker-slinky-2 -- bash --login -cd /fsx/awsome-distributed-training/3.test_cases/pytorch/FSDP/slurm +cd /fsx/awsome-distributed-ai/examples/training/fsdp/slurm # highlight changes, show timestamps, 5 second updates watch -n 5 -d "ls -lh checkpoints" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/buildspec.yml b/architectures/sagemaker-hyperpod-eks/slinky-slurm/buildspec.yml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/buildspec.yml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/buildspec.yml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/codebuild-stack.yaml b/architectures/sagemaker-hyperpod-eks/slinky-slurm/codebuild-stack.yaml similarity index 
100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/codebuild-stack.yaml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/codebuild-stack.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/codebuild.tf b/architectures/sagemaker-hyperpod-eks/slinky-slurm/codebuild.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/codebuild.tf rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/codebuild.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/custom.tfvars b/architectures/sagemaker-hyperpod-eks/slinky-slurm/custom.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/custom.tfvars rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/custom.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/deploy.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/deploy.sh similarity index 99% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/deploy.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/deploy.sh index bb168439c..1748def0a 100755 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/deploy.sh +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/deploy.sh @@ -414,7 +414,7 @@ deploy_tf() { if [[ ! -d "${tf_dir}" ]]; then echo "Error: Terraform modules directory not found: ${tf_dir}" echo " Expected at: ${tf_dir}" - echo " Make sure you have cloned the awsome-distributed-training repo." + echo " Make sure you have cloned the awsome-distributed-ai repo." 
exit 1 fi diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/destroy.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/destroy.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/destroy.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/destroy.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile b/architectures/sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/DASHBOARD.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/DASHBOARD.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/DASHBOARD.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/DASHBOARD.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/plan.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/plan.md diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/agent-deployment-skills/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/plan.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/plan.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/deployment-automation/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/idea.md 
b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/plan.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/plan.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/dockerfile-update-codebuild-ci/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md 
b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md similarity index 97% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md index 597466078..155458ed2 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/plan.md @@ -10,7 +10,7 @@ completed: 2026-03-06 ## Overview Migrate the CloudFormation deployment path from the old -`awsome-distributed-training` nested stack templates to the official +`awsome-distributed-ai` nested stack templates to the official SageMaker HyperPod service team maintained templates at `github.com/aws/sagemaker-hyperpod-cluster-setup`. @@ -18,7 +18,7 @@ SageMaker HyperPod service team maintained templates at ### Old System (current) -- **Template source:** `awsome-distributed-training/.../cfn-templates/nested-stacks/main-stack.yaml` +- **Template source:** `awsome-distributed-ai/.../cfn-templates/nested-stacks/main-stack.yaml` (curled at deploy time from GitHub raw URL) - **Params format:** Flat key-value pairs with individual parameters for each instance group property (`AcceleratedInstanceType`, `AcceleratedInstanceCount`, diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md similarity index 97% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md index 8f3e187a9..2f242c5fa 100644 --- 
a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-cfn-template-migration/shipped.md @@ -11,7 +11,7 @@ shipped: 2026-03-12 ## Summary Migrated the CloudFormation deployment path from the legacy -`awsome-distributed-training` nested stack templates (curled from GitHub +`awsome-distributed-ai` nested stack templates (curled from GitHub at deploy time) to the official SageMaker HyperPod service team S3-hosted templates at `aws-sagemaker-hyperpod-cluster-setup--prod`. diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md similarity index 92% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md index 2cfd5612d..98404f0b7 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/idea.md @@ -16,7 +16,7 @@ The Terraform deployment option (`g5/g5-custom.tfvars` and `p5/p5-custom.tfvars` uses outdated syntax that is incompatible with the latest SageMaker HyperPod EKS Terraform modules. The tfvars files need to be updated to match the current module interface located at -`1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars`. +`architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars`. 
## Proposed Solution @@ -75,7 +75,7 @@ create_new_fsx_filesystem = true ### Key References - **New module tfvars:** - `1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars` + `architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars` - **Existing files to update:** `g5/g5-custom.tfvars`, `p5/p5-custom.tfvars` diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/plan.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/plan.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/hyperpod-tf-module-update/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/plan.md similarity index 
100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/plan.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/shipped.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/slinky-helm-values-update/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/idea.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/idea.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/idea.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/idea.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/plan.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/plan.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/plan.md rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/plan.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/shipped.md b/architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/shipped.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/shipped.md rename to 
architectures/sagemaker-hyperpod-eks/slinky-slurm/docs/features/training-plan-support/shipped.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/install.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/install.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/install.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/install.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lib/deploy_helpers.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/lib/deploy_helpers.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lib/deploy_helpers.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/lib/deploy_helpers.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml.template b/architectures/sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml.template similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml.template rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml.template diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml b/architectures/sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/lustre-storageclass.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/mariadb.yaml b/architectures/sagemaker-hyperpod-eks/slinky-slurm/mariadb.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/mariadb.yaml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/mariadb.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml 
b/architectures/sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/openzfs-pvc-slurm.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml b/architectures/sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/openzfs-storageclass.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/params.json b/architectures/sagemaker-hyperpod-eks/slinky-slurm/params.json similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/params.json rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/params.json diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch b/architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch index ccb1e2cd9..a202c1431 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/g5-llama2_7b-training.sbatch @@ -87,7 +87,7 @@ declare -a TORCHRUN_ARGS=( ) export PATH="/usr/local/bin:$PATH" -export TRAIN_SCRIPT="/fsx/awsome-distributed-training/3.test_cases/pytorch/FSDP/src/train.py" +export TRAIN_SCRIPT="/fsx/awsome-distributed-ai/examples/training/fsdp/src/train.py" export PYTHONPATH="/usr/local/lib/python3.12/site-packages:$PYTHONPATH" export TORCHRUN="/usr/local/bin/python3 -m 
torch.distributed.run" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch b/architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch index 0a23a0f87..4ac3f9f8a 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch +++ b/architectures/sagemaker-hyperpod-eks/slinky-slurm/sbatch/fsdp/p5-llama2_7b-training.sbatch @@ -94,7 +94,7 @@ declare -a TORCHRUN_ARGS=( ) export PATH="/usr/local/bin:$PATH" -export TRAIN_SCRIPT="/fsx/awsome-distributed-training/3.test_cases/pytorch/FSDP/src/train.py" +export TRAIN_SCRIPT="/fsx/awsome-distributed-ai/examples/training/fsdp/src/train.py" export PYTHONPATH="/usr/local/lib/python3.12/site-packages:$PYTHONPATH" export TORCHRUN="/usr/local/bin/python3 -m torch.distributed.run" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/setup.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/setup.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/setup.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/setup.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png b/architectures/sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/slinky-slurm-hp-eks.png diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slurm-login-service-patch.yaml.template b/architectures/sagemaker-hyperpod-eks/slinky-slurm/slurm-login-service-patch.yaml.template similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slurm-login-service-patch.yaml.template rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/slurm-login-service-patch.yaml.template diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slurm-values.yaml.template b/architectures/sagemaker-hyperpod-eks/slinky-slurm/slurm-values.yaml.template similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slurm-values.yaml.template rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/slurm-values.yaml.template diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/custom.tfvars b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/custom.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/custom.tfvars rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/custom.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/params.json b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/params.json similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/params.json rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/params.json diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/slurm-values.yaml.template b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/slurm-values.yaml.template similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/slurm-values.yaml.template rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/fixtures/slurm-values.yaml.template diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/mock_aws.bash b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/mock_aws.bash similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/mock_aws.bash rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/mock_aws.bash diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/setup.bash b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/setup.bash similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/setup.bash rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/helpers/setup.bash diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/install_bats_libs.sh b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/install_bats_libs.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/install_bats_libs.sh rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/install_bats_libs.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_deploy.bats b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_deploy.bats similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_deploy.bats rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_deploy.bats diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_destroy.bats b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_destroy.bats similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_destroy.bats rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_destroy.bats diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_install.bats b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_install.bats similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_install.bats rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_install.bats 
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_setup.bats b/architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_setup.bats similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/tests/test_setup.bats rename to architectures/sagemaker-hyperpod-eks/slinky-slurm/tests/test_setup.bats diff --git a/1.architectures/7.sagemaker-hyperpod-eks/smhp-eks-arch.png b/architectures/sagemaker-hyperpod-eks/smhp-eks-arch.png similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/smhp-eks-arch.png rename to architectures/sagemaker-hyperpod-eks/smhp-eks-arch.png diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml b/architectures/sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml rename to architectures/sagemaker-hyperpod-eks/task-governance/1-imagenet-gpu-team-a.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml b/architectures/sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml rename to architectures/sagemaker-hyperpod-eks/task-governance/2-hyperpod-cli-example-team-b.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml b/architectures/sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml rename to architectures/sagemaker-hyperpod-eks/task-governance/3-imagenet-gpu-team-b-higher-prio.yaml diff --git a/1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md 
b/architectures/sagemaker-hyperpod-eks/task-governance/README.md similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/task-governance/README.md rename to architectures/sagemaker-hyperpod-eks/task-governance/README.md diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/.gitignore b/architectures/sagemaker-hyperpod-eks/terraform-modules/.gitignore similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/.gitignore rename to architectures/sagemaker-hyperpod-eks/terraform-modules/.gitignore diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md b/architectures/sagemaker-hyperpod-eks/terraform-modules/README.md similarity index 98% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md rename to architectures/sagemaker-hyperpod-eks/terraform-modules/README.md index 054d89435..582a91151 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md +++ b/architectures/sagemaker-hyperpod-eks/terraform-modules/README.md @@ -10,8 +10,8 @@ The diagram below depicts the Terraform modules that have been bundled into a si ## Get the Modules Clone the AWSome Distributed Training repository and navigate to the terraform-modules directory: ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf ``` --- @@ -274,7 +274,7 @@ For air-gapped or closed network environments without internet access: ```bash # Navigate to the terraform modules directory -cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf +cd awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf # 1. 
Set your AWS account and region export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) @@ -472,7 +472,7 @@ Once prerequisites are complete and your Helm chart is prepared: ```bash # Navigate to terraform directory -cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf +cd awsome-distributed-ai/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf # Initialize Terraform terraform init diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/closed-network.tfvars b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/closed-network.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/closed-network.tfvars rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/closed-network.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/custom.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/eks_cluster/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/outputs.tf rename to 
architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/fsx_lustre/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/cert_manager.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/cert_manager.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/cert_manager.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/cert_manager.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/variables.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/iam_roles.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/iam_roles.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/iam_roles.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/iam_roles.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/tls_s3_bucket.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/tls_s3_bucket.tf similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/tls_s3_bucket.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/tls_s3_bucket.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_inference_operator/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf similarity index 96% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf index 7e5f7df78..2de7151b5 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf +++ b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/main.tf @@ -45,7 +45,7 @@ resource "aws_iam_role_policy_attachment" "hpto-policy" { # at https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator-install.html # and is immune to future addon-metadata changes. 
# -# See: https://github.com/awslabs/awsome-distributed-training/issues/1075 +# See: https://github.com/awslabs/awsome-distributed-ai/issues/1075 resource "aws_eks_pod_identity_association" "hpto_association" { cluster_name = var.eks_cluster_name namespace = "aws-hyperpod" diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_training_operator/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/outputs.tf 
b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf similarity index 75% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf index 98ffe2129..b38ec6b3a 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf +++ b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/lifecycle_script/variables.tf @@ -14,7 +14,7 @@ variable "script_urls" { description = "List of raw URLs of the script files to upload" type = list(string) default = [ - "https://raw.githubusercontent.com/awslabs/awsome-distributed-training/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh", - "https://raw.githubusercontent.com/awslabs/awsome-distributed-training/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh" + "https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh", + "https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create_main.sh" ] } diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/eks_addon.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/eks_addon.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/eks_addon.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/eks_addon.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/grafana_workspace.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/grafana_workspace.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/grafana_workspace.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/grafana_workspace.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/iam_roles.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/iam_roles.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/iam_roles.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/iam_roles.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/main.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/prometheus_workspace.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/prometheus_workspace.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/prometheus_workspace.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/prometheus_workspace.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/version.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/version.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/version.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/version.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/vpc_endpoints.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/vpc_endpoints.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/vpc_endpoints.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/observability/vpc_endpoints.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/private_subnet/variables.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/s3_bucket/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/outputs.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/security_group/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh rename to 
architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/scripts/manage-compute-quota.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/task_governance/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc/variables.tf diff --git 
a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/main.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/vpc_endpoints/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf 
b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/providers.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/delete-hyperpod-nodes.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/delete-hyperpod-nodes.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/delete-hyperpod-nodes.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/delete-hyperpod-nodes.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/guardduty-cleanup.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/guardduty-cleanup.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/guardduty-cleanup.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/guardduty-cleanup.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/wait-for-hyperpod-nodes.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/wait-for-hyperpod-nodes.sh similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/wait-for-hyperpod-nodes.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/scripts/wait-for-hyperpod-nodes.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/terraform.tfvars diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/copy-images-to-ecr.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/copy-images-to-ecr.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/copy-images-to-ecr.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/copy-images-to-ecr.sh diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/ecr-images.conf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/ecr-images.conf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/ecr-images.conf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/ecr-images.conf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/list-ecr-repos.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/list-ecr-repos.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/list-ecr-repos.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/list-ecr-repos.sh diff 
--git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/update-values-with-ecr.py b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/update-values-with-ecr.py similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/update-values-with-ecr.py rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/update-values-with-ecr.py diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/verify-aws-connectivity.py b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/verify-aws-connectivity.py similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/verify-aws-connectivity.py rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/tools/verify-aws-connectivity.py diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf b/architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf rename to architectures/sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/versions.tf diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png b/architectures/sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png similarity index 100% rename from 
1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png rename to architectures/sagemaker-hyperpod-eks/terraform-modules/smhp_tf_modules.png diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh b/architectures/sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh similarity index 100% rename from 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh rename to architectures/sagemaker-hyperpod-eks/terraform-modules/terraform_outputs.sh diff --git a/1.architectures/5.sagemaker-hyperpod/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json b/architectures/sagemaker-hyperpod-slurm/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json rename to architectures/sagemaker-hyperpod-slurm/0.AmazonSageMakerClustersExecutionRoleTrustedEntities.json diff --git a/1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json b/architectures/sagemaker-hyperpod-slurm/1.AmazonSageMakerClustersExecutionRolePolicy.json similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json rename to architectures/sagemaker-hyperpod-slurm/1.AmazonSageMakerClustersExecutionRolePolicy.json diff --git a/1.architectures/5.sagemaker-hyperpod/2.SageMakerVPC.yaml b/architectures/sagemaker-hyperpod-slurm/2.SageMakerVPC.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/2.SageMakerVPC.yaml rename to architectures/sagemaker-hyperpod-slurm/2.SageMakerVPC.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/3.FSxLustre.yaml b/architectures/sagemaker-hyperpod-slurm/3.FSxLustre.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/3.FSxLustre.yaml rename to architectures/sagemaker-hyperpod-slurm/3.FSxLustre.yaml diff --git 
a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/.gitattributes b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/.gitattributes similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/.gitattributes rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/.gitattributes diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/README.md b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/README.md similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/README.md rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/README.md diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/add_users.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/add_users.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/add_users.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/add_users.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/create_posix_users.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/create_posix_users.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/create_posix_users.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/create_posix_users.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_home_dirs.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_home_dirs.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_home_dirs.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_home_dirs.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_slurm_accounts.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_slurm_accounts.sh similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_slurm_accounts.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_slurm_accounts.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_ssh_keys.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_ssh_keys.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/setup_ssh_keys.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/setup_ssh_keys.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/shared_users_sample.txt b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/shared_users_sample.txt similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/shared_users_sample.txt rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/shared_users_sample.txt diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/add-users/shared_users_sample.yaml b/architectures/sagemaker-hyperpod-slurm/Extensions/add-users/shared_users_sample.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/add-users/shared_users_sample.yaml rename to architectures/sagemaker-hyperpod-slurm/Extensions/add-users/shared_users_sample.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/detect-node/README.md b/architectures/sagemaker-hyperpod-slurm/Extensions/detect-node/README.md similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/detect-node/README.md rename to architectures/sagemaker-hyperpod-slurm/Extensions/detect-node/README.md diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/detect-node/detect_node.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/detect-node/detect_node.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/detect-node/detect_node.sh rename to 
architectures/sagemaker-hyperpod-slurm/Extensions/detect-node/detect_node.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/.gitattributes b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/.gitattributes similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/.gitattributes rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/.gitattributes diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/LICENSE_SLURM_EXPORTER.txt b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/LICENSE_SLURM_EXPORTER.txt similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/LICENSE_SLURM_EXPORTER.txt rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/LICENSE_SLURM_EXPORTER.txt diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/README.md b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/README.md similarity index 99% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/README.md rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/README.md index c3ebdafbf..75ef0a6b5 100644 --- a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/README.md +++ b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/README.md @@ -144,7 +144,7 @@ Deploy the CloudFormation stack that provisions an AMP workspace and an Amazon Managed Grafana workspace: ```bash -wget https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/main/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml +wget https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/validation_and_observability/prometheus-grafana/cluster-observability.yaml aws cloudformation create-stack \ --stack-name hyperpod-observability \ diff --git 
a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/config.json b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/config.json similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/config.json rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/config.json diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/dcgm_metrics_config/dcgm-metrics-basic.csv b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/dcgm_metrics_config/dcgm-metrics-basic.csv similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/dcgm_metrics_config/dcgm-metrics-basic.csv rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/dcgm_metrics_config/dcgm-metrics-basic.csv diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_dcgm_exporter.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_dcgm_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_dcgm_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_dcgm_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_efa_exporter.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_efa_exporter.sh similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_efa_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_efa_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_node_exporter.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_node_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_node_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_node_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_observability.py b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_observability.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_observability.py rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_observability.py diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_otel_collector.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_otel_collector.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_otel_collector.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_otel_collector.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_slurm_exporter.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_slurm_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/install_slurm_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/install_slurm_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-compute-template.yaml 
b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-compute-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-compute-template.yaml rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-compute-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-head-template.yaml b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-head-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-head-template.yaml rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-head-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-login-template.yaml b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-login-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/otel_config/config-login-template.yaml rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/otel_config/config-login-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/setup_observability.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/setup_observability.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/setup_observability.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/setup_observability.sh diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/observability/stop_observability.py b/architectures/sagemaker-hyperpod-slurm/Extensions/observability/stop_observability.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/observability/stop_observability.py 
rename to architectures/sagemaker-hyperpod-slurm/Extensions/observability/stop_observability.py diff --git a/1.architectures/5.sagemaker-hyperpod/Extensions/run_extensions.sh b/architectures/sagemaker-hyperpod-slurm/Extensions/run_extensions.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/Extensions/run_extensions.sh rename to architectures/sagemaker-hyperpod-slurm/Extensions/run_extensions.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/add_users.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/add_users.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/add_users.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/add_users.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/apply_hotfix.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/apply_hotfix.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/apply_hotfix.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/apply_hotfix.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/config.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/config.py diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/epilog.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/epilog.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/epilog.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/epilog.sh diff --git 
a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/lifecycle_script.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/lifecycle_script.py diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/mount_fsx.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/mount_fsx.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/mount_fsx_openzfs.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh rename to 
architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/mount_fsx_openzfs.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_notification.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/multi_headnode_setup/headnode_notification.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_notification.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/multi_headnode_setup/headnode_notification.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/multi_headnode_setup/headnode_setup.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/LICENSE_SLURM_EXPORTER.txt b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/LICENSE_SLURM_EXPORTER.txt similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/LICENSE_SLURM_EXPORTER.txt rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/LICENSE_SLURM_EXPORTER.txt diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-advanced.csv diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-basic.csv b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-basic.csv similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-basic.csv rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/dcgm_metrics_config/dcgm-metrics-basic.csv diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_dcgm_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_efa_exporter.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_efa_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_efa_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_efa_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_node_exporter.sh 
b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_node_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_node_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_node_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_observability.py b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_observability.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_observability.py rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_observability.py diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_otel_collector.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_otel_collector.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_otel_collector.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_otel_collector.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_slurm_exporter.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_slurm_exporter.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/install_slurm_exporter.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/install_slurm_exporter.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-compute-template.yaml 
b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-compute-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-compute-template.yaml rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-compute-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-head-template.yaml b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-head-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-head-template.yaml rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-head-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-login-template.yaml b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-login-template.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/otel_config/config-login-template.yaml rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/otel_config/config-login-template.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/stop_observability.py b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/stop_observability.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/observability/stop_observability.py rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/observability/stop_observability.py diff --git 
a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/on_create.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/on_create.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/on_create.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/on_create.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/prolog.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/prolog.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/prolog.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/prolog.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_mariadb_accounting.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_mariadb_accounting.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_mariadb_accounting.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_mariadb_accounting.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_rds_accounting.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_rds_accounting.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_rds_accounting.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_rds_accounting.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_sssd.py b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_sssd.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_sssd.py rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_sssd.py diff --git 
a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_user_associations.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_user_associations.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_user_associations.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/setup_user_associations.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/shared_users_sample.txt b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/shared_users_sample.txt similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/shared_users_sample.txt rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/shared_users_sample.txt diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/start_slurm.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/start_slurm.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/create_users.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/create_users.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/create_users.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/create_users.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enroot.conf b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/enroot.conf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enroot.conf rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/enroot.conf diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_auto_detect.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/fsx_auto_detect.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_auto_detect.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/fsx_auto_detect.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/fsx_ubuntu.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/fsx_ubuntu.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_ansible.sh 
b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_ansible.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_ansible.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_ansible.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_docker.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_docker.sh similarity index 98% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_docker.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_docker.sh index 58df7fcdd..f3d9503b6 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_docker.sh +++ b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_docker.sh @@ -63,7 +63,7 @@ sudo usermod -aG docker ubuntu # Opportunistically use /opt/sagemaker or /opt/dlami/nvme if present. Let's be extra careful in the probe. -# See: https://github.com/awslabs/awsome-distributed-training/issues/127 +# See: https://github.com/awslabs/awsome-distributed-ai/issues/127 # # Docker workdir doesn't like Lustre. Tried with storage driver overlay2, fuse-overlayfs, & vfs. 
if [[ $(mount | grep /opt/sagemaker) ]]; then diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh similarity index 98% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh index de6282bc0..08db4efa1 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh +++ b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/install_enroot_pyxis.sh @@ -119,7 +119,7 @@ while true; do ELAPSED_TIME=$((ELAPSED_TIME + CHECK_INTERVAL)) if [[ $ELAPSED_TIME -ge $MAX_WAIT_TIME ]]; then - echo "WARN: Timeout reached: dlami-nvme.service did not become active and successful, it is possible enroot default path is /opt/sagemaker. When training larger models, dragons be here. See https://github.com/awslabs/awsome-distributed-training/issues/427 for corrective actions" + echo "WARN: Timeout reached: dlami-nvme.service did not become active and successful, it is possible enroot default path is /opt/sagemaker. When training larger models, dragons be here. 
See https://github.com/awslabs/awsome-distributed-ai/issues/427 for corrective actions" break fi diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/motd.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/motd.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.txt b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/motd.txt similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/motd.txt rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/motd.txt diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/mount-s3.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/mount-s3.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/mount-s3.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/mount-s3.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/pam_adopt_cgroup_wheel.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/slurm_fix_plugstackconf.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/ssh-to-compute.sh b/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/ssh-to-compute.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/ssh-to-compute.sh rename to architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/utils/ssh-to-compute.sh diff --git a/1.architectures/5.sagemaker-hyperpod/README.md b/architectures/sagemaker-hyperpod-slurm/README.md similarity index 98% rename from 1.architectures/5.sagemaker-hyperpod/README.md rename to architectures/sagemaker-hyperpod-slurm/README.md index 6f4b7791a..ff34f93fd 100644 --- a/1.architectures/5.sagemaker-hyperpod/README.md +++ b/architectures/sagemaker-hyperpod-slurm/README.md @@ -10,7 +10,7 @@ SageMaker HyperPod clusters provide the ability to create customized clusters, t The example that follows describes the process of setting up a SageMaker HyperPod cluster with an attached FSX for Lustre volume. -**Note: For a guided set-up experience, check out the [HyperPod automation script](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md).** +**Note: For a guided set-up experience, check out the [HyperPod automation script](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/README.md).** ## 2. Prerequisites @@ -65,7 +65,7 @@ Now we can create a VPC. This is only necessary if you want to attach your Hyper You can create a VPC using the configuration in [2.SageMakerVPC.yaml](./2.SageMakerVPC.yaml). Which is also available via [
 1-Click Deploy 🚀 
](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/Vpc.yaml&stackName=SageMakerVPC) - + Feel free to change the stack and VPC names. Make sure to select an availability zone that supports your preferred instance type ([Find an Amazon EC2 instance type](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-discovery.html)). Leave both S3 and DynamoDB endpoints set to True. You can leave the IAM role blank. @@ -77,7 +77,7 @@ FSx for Lustre provides a shared high performance file system that's accessible Similar to the VPC we just created, you can create an FSx for Lustre volume using [3.FSxLustre.yaml](./3.FSxLustre.yaml), or by using [
 1-Click Deploy 🚀 
](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/FSxLustre.yaml&stackName=FSxLustre) - + Change the stack name, capacity, throughput, and compression configurations as you wish. Select the latest Lustre version (2.15 by default). Under Network Options, select the Security Group ID and Private Subnet ID you created using the VPC CloudFormation stack in the previous step. Once again, you can leave the IAM role blank. @@ -237,7 +237,7 @@ aws ec2 describe-security-groups \ ### 3.3 Launch a new cluster -Now that everything is in place, we can launch our cluster with the command from the `5.sagemaker-hyperpod` directory. +Now that everything is in place, we can launch our cluster with the command from the `sagemaker-hyperpod-slurm` directory. ``` aws sagemaker create-cluster \ @@ -356,7 +356,7 @@ You can also run validation on the scripts you wish to run. This ensures you’r ``` # Run a check on a specific sbatch script that launches training -python3 hyperpod-precheck.py -f ../../3.test_cases/1.megatron-lm/2.distributed-training.sbatch +python3 hyperpod-precheck.py -f ../../examples/1.megatron-lm/2.distributed-training.sbatch ``` diff --git a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md b/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/README.md similarity index 90% rename from 1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md rename to architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/README.md index 41f458dbf..c64d9bf08 100644 --- a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/README.md +++ b/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/README.md @@ -7,7 +7,7 @@ It handles the installation and configuration of necessary tools, clones the req ## Demo -![SageMaker Hyperpod Cluster Automation 
Demo](/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/media/automate-smhp-demo.gif) +![SageMaker Hyperpod Cluster Automation Demo](/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/media/automate-smhp-demo.gif) This demo gif showcases the step-by-step process of creating and setting up a SageMaker Hyperpod cluster using our automation script. @@ -27,8 +27,8 @@ This demo gif showcases the step-by-step process of creating and setting up a Sa 1. Clone this repository: ```bash - git clone https://github.com/awslabs/awsome-distributed-training.git - cd 1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm + git clone https://github.com/awslabs/awsome-distributed-ai.git + cd architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm ``` 2. Make the script executable: @@ -48,7 +48,7 @@ The script will guide you through the following steps: 1. Check and install/update AWS CLI if necessary. 2. Verify Git installation. -3. Clone the "awsome-distributed-training" repository. +3. Clone the "awsome-distributed-ai" repository. 4. Set up environment variables. 5. Configure lifecycle scripts for SageMaker Hyperpod. 
diff --git a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh b/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/automate-cluster-creation.sh similarity index 97% rename from 1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh rename to architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/automate-cluster-creation.sh index e6befeafb..74722a966 100755 --- a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh +++ b/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/automate-cluster-creation.sh @@ -105,7 +105,7 @@ check_git() { } clone_adt() { - REPO_NAME="awsome-distributed-training" + REPO_NAME="awsome-distributed-ai" if [ -d "$REPO_NAME" ]; then echo -e "${YELLOW}⚠️ The directory '$REPO_NAME' already exists.${NC}" echo -e "${GREEN}Do you want to remove it and clone again? (yes/no): ${NC}" @@ -114,14 +114,14 @@ clone_adt() { echo -e "${YELLOW}Removing existing directory...${NC}" rm -rf "$REPO_NAME" echo -e "${BLUE}Cloning repository...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" else echo -e "${BLUE}Using existing directory...${NC}" fi else echo -e "${BLUE}Cloning repository $REPO_NAME...${NC}" - git clone --depth=1 https://github.com/awslabs/awsome-distributed-training/ + git clone --depth=1 https://github.com/awslabs/awsome-distributed-ai/ echo -e "${GREEN}✅ Repository cloned successfully${NC}" fi } @@ -166,7 +166,7 @@ multi_headnode() { echo -e "${YELLOW}The following CloudFormation command will be executed:${NC}" echo -e "${GREEN}aws cloudformation deploy \\ - --template-file awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml \\ + --template-file 
awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod-slurm-multi-headnode.yaml \\ --stack-name ${MULTI_HEAD_SLURM_STACK} \\ --parameter-overrides \\ SlurmDBSecurityGroupId=${SECURITY_GROUP} \\ @@ -185,7 +185,7 @@ multi_headnode() { # Deploy the multi-head CF stack aws cloudformation deploy \ - --template-file awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml \ + --template-file awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod-slurm-multi-headnode.yaml \ --stack-name ${MULTI_HEAD_SLURM_STACK} \ --parameter-overrides \ SlurmDBSecurityGroupId=${SECURITY_GROUP} \ @@ -237,7 +237,7 @@ multi_headnode() { create_and_attach_policy() { aws iam create-policy \ --policy-name AmazonSageMakerExecutionPolicy \ - --policy-document file://awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json --output json && \ + --policy-document file://awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/1.AmazonSageMakerClustersExecutionRolePolicy.json --output json && \ aws iam attach-role-policy \ --role-name $SLURM_EXECUTION_ROLE \ --policy-arn arn:aws:iam::${AWS_ACCOUNT_ID}:policy/AmazonSageMakerExecutionPolicy @@ -251,13 +251,13 @@ multi_headnode() { if [[ $error_output == *"EntityAlreadyExists"* ]]; then echo -e "\n${YELLOW}If the error you received is that the policy already exists, you can either:${NC}" - echo -e "\n${GREEN} 1. Continue the script with the existing policy (make sure the permissions match the ones in https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json) and manually attach it to your role ${SLURM_EXECUTION_ROLE}, or${NC}" + echo -e "\n${GREEN} 1. 
Continue the script with the existing policy (make sure the permissions match the ones in https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-slurm/1.AmazonSageMakerClustersExecutionRolePolicy.json) and manually attach it to your role ${SLURM_EXECUTION_ROLE}, or${NC}" echo -e "\n${GREEN} 2. You can create a new policy with a name different than 'AmazonSageMakerExecutionPolicy' manually and attach it to your 'AmazonSageMakerExecutionRole' with the following command. Once you do that, you can continue with the rest of the script:${NC}" echo -e "\n${YELLOW} Creating an IAM policy (required for option 2 above)${NC}" echo -e "\n${BLUE} aws iam create-policy \\ --policy-name \\ - --policy-document file://awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/1.AmazonSageMakerClustersExecutionRolePolicy.json${NC}" + --policy-document file://awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/1.AmazonSageMakerClustersExecutionRolePolicy.json${NC}" echo -e "\n${YELLOW} Attach an IAM policy to an IAM role (required for options 1 & 2 above)${NC}" echo -e "\n${BLUE} aws iam attach-role-policy \\ @@ -287,7 +287,7 @@ multi_headnode() { # Function to setup environment variables setup_env_vars() { echo -e "${BLUE}=== Setting Up Environment Variables ===${NC}" - echo -e "${GREEN}Cloning awsome-distributed-training${NC}" + echo -e "${GREEN}Cloning awsome-distributed-ai${NC}" clone_adt echo -e "${BLUE}Enter the name of the SageMaker VPC CloudFormation stack that was deployed as a prerequisite (default: sagemaker-hyperpod):${NC}" @@ -306,7 +306,7 @@ setup_env_vars() { echo -e "${YELLOW}Generating new environment variables...${NC}" generate_env_vars() { - bash awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/create_config.sh + bash awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/create_config.sh # bash create_config.sh } @@ -345,7 +345,7 @@ setup_env_vars() { setup_lifecycle_scripts() { echo -e 
"${BLUE}=== Setting Up Lifecycle Scripts ===${NC}" - cd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/ + cd awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/ # Check if FSx OpenZFS was deployed in the stack echo -e "${BLUE}Checking if FSx OpenZFS was deployed in the stack...${NC}" @@ -873,7 +873,7 @@ validate_cluster_config() { echo "Validating your cluster configuration..." # TODO: MAKE SURE PACKAGES ARE INSTALLED HERE!! - curl -O https://raw.githubusercontent.com/awslabs/awsome-distributed-training/main/1.architectures/5.sagemaker-hyperpod/validate-config.py + curl -O https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/architectures/sagemaker-hyperpod-slurm/validate-config.py # check config for known issues python3 validate-config.py --cluster-config cluster-config.json --provisioning-parameters provisioning_parameters.json --region $AWS_REGION diff --git a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/media/automate-smhp-demo.gif b/architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/media/automate-smhp-demo.gif similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/media/automate-smhp-demo.gif rename to architectures/sagemaker-hyperpod-slurm/automate-smhp-slurm/media/automate-smhp-demo.gif diff --git a/1.architectures/5.sagemaker-hyperpod/create_config.sh b/architectures/sagemaker-hyperpod-slurm/create_config.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/create_config.sh rename to architectures/sagemaker-hyperpod-slurm/create_config.sh diff --git a/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh b/architectures/sagemaker-hyperpod-slurm/easy-ssh.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/easy-ssh.sh rename to architectures/sagemaker-hyperpod-slurm/easy-ssh.sh diff --git a/1.architectures/5.sagemaker-hyperpod/health_check/README.md 
b/architectures/sagemaker-hyperpod-slurm/health_check/README.md similarity index 99% rename from 1.architectures/5.sagemaker-hyperpod/health_check/README.md rename to architectures/sagemaker-hyperpod-slurm/health_check/README.md index d839768c6..fef3ce7fa 100644 --- a/1.architectures/5.sagemaker-hyperpod/health_check/README.md +++ b/architectures/sagemaker-hyperpod-slurm/health_check/README.md @@ -57,8 +57,8 @@ Place the orchestrator and your health check script on shared storage: ``` cd /fsx/ubuntu -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/health_check +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/health_check chmod +x health_check_orchestrator.sh dcgm.sh prolog_dcgm.sh cp health_check_orchestrator.sh dcgm.sh prolog_dcgm.sh /fsx/ubuntu/ ``` diff --git a/1.architectures/5.sagemaker-hyperpod/health_check/dcgm.sh b/architectures/sagemaker-hyperpod-slurm/health_check/dcgm.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/health_check/dcgm.sh rename to architectures/sagemaker-hyperpod-slurm/health_check/dcgm.sh diff --git a/1.architectures/5.sagemaker-hyperpod/health_check/health_check_orchestrator.sh b/architectures/sagemaker-hyperpod-slurm/health_check/health_check_orchestrator.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/health_check/health_check_orchestrator.sh rename to architectures/sagemaker-hyperpod-slurm/health_check/health_check_orchestrator.sh diff --git a/1.architectures/5.sagemaker-hyperpod/health_check/prolog_dcgm.sh b/architectures/sagemaker-hyperpod-slurm/health_check/prolog_dcgm.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/health_check/prolog_dcgm.sh rename to architectures/sagemaker-hyperpod-slurm/health_check/prolog_dcgm.sh diff --git a/1.architectures/5.sagemaker-hyperpod/hyperpod-precheck.py 
b/architectures/sagemaker-hyperpod-slurm/hyperpod-precheck.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/hyperpod-precheck.py rename to architectures/sagemaker-hyperpod-slurm/hyperpod-precheck.py diff --git a/1.architectures/5.sagemaker-hyperpod/neuron-sdk-management.md b/architectures/sagemaker-hyperpod-slurm/neuron-sdk-management.md similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/neuron-sdk-management.md rename to architectures/sagemaker-hyperpod-slurm/neuron-sdk-management.md diff --git a/1.architectures/5.sagemaker-hyperpod/patching-backup.sh b/architectures/sagemaker-hyperpod-slurm/patching-backup.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/patching-backup.sh rename to architectures/sagemaker-hyperpod-slurm/patching-backup.sh diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml b/architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod-slurm-multi-headnode.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod-slurm-multi-headnode.yaml rename to architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod-slurm-multi-headnode.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod.yaml similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml rename to architectures/sagemaker-hyperpod-slurm/sagemaker-hyperpod.yaml diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/README.md b/architectures/sagemaker-hyperpod-slurm/slurm-studio/README.md similarity index 93% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/README.md rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/README.md index ce71c0851..eab365696 100644 --- a/1.architectures/5.sagemaker-hyperpod/slurm-studio/README.md +++ b/architectures/sagemaker-hyperpod-slurm/slurm-studio/README.md @@ -10,7 
+10,7 @@ We will help set up your Studio environment so that: **Why login nodes?** Login nodes allow users to login to the cluster, submit jobs, and view and manipulate data without running on the critical `slurmctld` scheduler node. This also allows you to run monitoring servers like [aim](https://github.com/aimhubio/aim), [Tensorboard](https://www.tensorflow.org/tensorboard), or [Grafana/Prometheus](https://prometheus.io/docs/visualization/grafana/). -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/01-studio-hyperpod-architecture.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/01-studio-hyperpod-architecture.png) ## Table of Contents @@ -59,17 +59,17 @@ You can deploy the CloudFormation template, which creates the following resource 2. Associates the `security-group-for-inbound-nfs` security group to the FSx for Lustre ENIs 3. **Optional**: If **SharedFSx** is set to **True**, creates the partition *shared* in the FSx for Lustre volume, and associates it to the Studio domain -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/07-fsx-shared.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/07-fsx-shared.png) 4. If **SharedFSx** is set to **False**, a Lambda function that: 1. Creates the partition */{user_profile_name}*, and associates it to the Studio user profile 5. If **SharedFSx** is set to **False**, an Event bridge rule that invokes the previously defined Lambda function each time a new user is created. -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/08-fsx-partitioned.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/08-fsx-partitioned.png) You can deploy the stack via - [
 1-Click Deploy 🚀 
](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/studio-slurm.yaml) + [
 1-Click Deploy 🚀 
](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/studio-slurm.yaml) The CloudFormation template requires the following parameters: @@ -88,12 +88,12 @@ As an admin user, once your SageMaker Studio Domain is provisioned, you may go i > [!NOTE] > This step *DOES NOT* assume that you already have a Studio Domain. To create one, check out the next section titled **"SageMaker Studio Domain Setup"**. -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/09-studio-user.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/09-studio-user.png) You can now select your preferred IDE from SageMaker Studio. -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/02-studio-home.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/02-studio-home.png) For the purpose of this workshop, we are going to create a Code Editor environment. @@ -107,13 +107,13 @@ From the top-left menu: 6. From the **Lifecycle configuration** dropdown menu, select the available lifecycle configuration -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/03-codeditor-fsx.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/03-codeditor-fsx.png) Click on **Run Space**. 
Wait until the space is created, then click **Open Code Editor** To verify that your file system was mounted, you can check if you have a path mounted in the Code Editor space `custom-file-system/fsx_lustre/`: -![SageMaker Studio with Hyperpod integration](/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/10-filesystem-check.png) +![SageMaker Studio with Hyperpod integration](/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/10-filesystem-check.png) You can also run: @@ -292,8 +292,8 @@ The solution automates the process of setting up the SLURM client configuration, ## Repository Structure ``` . -└── 1.architectures/ - └── 5.sagemaker-hyperpod/ +└── architectures/ + └── sagemaker-hyperpod-slurm/ └── slurm-studio/ ├── slurm_lifecycle.sh # Automated script for SLURM environment setup on SageMaker Studio └── studio-slurm.yaml # CloudFormation template for SageMaker Studio Domain setup diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/01-studio-hyperpod-architecture.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/01-studio-hyperpod-architecture.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/01-studio-hyperpod-architecture.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/01-studio-hyperpod-architecture.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/02-studio-home.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/02-studio-home.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/02-studio-home.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/02-studio-home.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/03-codeditor-fsx.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/03-codeditor-fsx.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/03-codeditor-fsx.png rename to 
architectures/sagemaker-hyperpod-slurm/slurm-studio/media/03-codeditor-fsx.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/07-fsx-shared.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/07-fsx-shared.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/07-fsx-shared.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/07-fsx-shared.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/08-fsx-partitioned.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/08-fsx-partitioned.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/08-fsx-partitioned.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/08-fsx-partitioned.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/09-studio-user.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/09-studio-user.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/09-studio-user.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/09-studio-user.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/media/10-filesystem-check.png b/architectures/sagemaker-hyperpod-slurm/slurm-studio/media/10-filesystem-check.png similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/media/10-filesystem-check.png rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/media/10-filesystem-check.png diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh b/architectures/sagemaker-hyperpod-slurm/slurm-studio/slurm_lifecycle.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/slurm_lifecycle.sh diff --git a/1.architectures/5.sagemaker-hyperpod/slurm-studio/studio-slurm.yaml 
b/architectures/sagemaker-hyperpod-slurm/slurm-studio/studio-slurm.yaml similarity index 99% rename from 1.architectures/5.sagemaker-hyperpod/slurm-studio/studio-slurm.yaml rename to architectures/sagemaker-hyperpod-slurm/slurm-studio/studio-slurm.yaml index a7933743e..aa24239b3 100644 --- a/1.architectures/5.sagemaker-hyperpod/slurm-studio/studio-slurm.yaml +++ b/architectures/sagemaker-hyperpod-slurm/slurm-studio/studio-slurm.yaml @@ -146,7 +146,7 @@ Resources: # Download and execute script export TEMP_SCRIPT="/tmp/slurm_setup.sh" curl -sL \ - "https://raw.githubusercontent.com/awslabs/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh" \ + "https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/refs/heads/main/architectures/sagemaker-hyperpod-slurm/slurm-studio/slurm_lifecycle.sh" \ -o "\$TEMP_SCRIPT" chmod +x "\$TEMP_SCRIPT" @@ -213,7 +213,7 @@ Resources: # Download and execute script export TEMP_SCRIPT="/tmp/slurm_setup.sh" curl -sL \ - "https://raw.githubusercontent.com/awslabs/awsome-distributed-training/refs/heads/main/1.architectures/5.sagemaker-hyperpod/slurm-studio/slurm_lifecycle.sh" \ + "https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/refs/heads/main/architectures/sagemaker-hyperpod-slurm/slurm-studio/slurm_lifecycle.sh" \ -o "\$TEMP_SCRIPT" chmod +x "\$TEMP_SCRIPT" diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/README.md b/architectures/sagemaker-hyperpod-slurm/terraform-modules/README.md similarity index 95% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/README.md rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/README.md index 5fcbe6f01..f3d84e74b 100644 --- a/1.architectures/5.sagemaker-hyperpod/terraform-modules/README.md +++ b/architectures/sagemaker-hyperpod-slurm/terraform-modules/README.md @@ -17,8 +17,8 @@ The Terraform modules create: 1. 
**Clone and Navigate** ```bash - git clone https://github.com/awslabs/awsome-distributed-training.git - cd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf + git clone https://github.com/awslabs/awsome-distributed-ai.git + cd awsome-distributed-ai/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf ``` 2. **Customize Configuration** diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/.gitignore b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/.gitignore similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/.gitignore rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/.gitignore diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/easy-ssh.sh diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_lustre/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/fsx_openzfs/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/variables.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/hyperpod_cluster/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/lifecycle_script/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/private_subnet/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_bucket/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/variables.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/s3_endpoint/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/sagemaker_iam_role/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf rename to 
architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/security_group/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/security_group/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/main.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/outputs.tf diff --git 
a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/modules/vpc/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/outputs.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/outputs.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/outputs.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/outputs.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/providers.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/providers.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/providers.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/providers.tf diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/terraform.tfvars.example diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh similarity index 100% rename from 
1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/terraform_outputs.sh diff --git a/1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/variables.tf b/architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/variables.tf similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/terraform-modules/hyperpod-slurm-tf/variables.tf rename to architectures/sagemaker-hyperpod-slurm/terraform-modules/hyperpod-slurm-tf/variables.tf diff --git a/1.architectures/5.sagemaker-hyperpod/tools/README.md b/architectures/sagemaker-hyperpod-slurm/tools/README.md similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/tools/README.md rename to architectures/sagemaker-hyperpod-slurm/tools/README.md diff --git a/1.architectures/5.sagemaker-hyperpod/tools/dump_cluster_nodes_info.py b/architectures/sagemaker-hyperpod-slurm/tools/dump_cluster_nodes_info.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/tools/dump_cluster_nodes_info.py rename to architectures/sagemaker-hyperpod-slurm/tools/dump_cluster_nodes_info.py diff --git a/1.architectures/5.sagemaker-hyperpod/validate-config.py b/architectures/sagemaker-hyperpod-slurm/validate-config.py similarity index 100% rename from 1.architectures/5.sagemaker-hyperpod/validate-config.py rename to architectures/sagemaker-hyperpod-slurm/validate-config.py diff --git a/1.architectures/1.vpc_network/1.vpc-multi-az.yaml b/architectures/vpc_network/1.vpc-multi-az.yaml similarity index 100% rename from 1.architectures/1.vpc_network/1.vpc-multi-az.yaml rename to architectures/vpc_network/1.vpc-multi-az.yaml diff --git a/1.architectures/1.vpc_network/2.vpc-one-az.yaml b/architectures/vpc_network/2.vpc-one-az.yaml similarity index 100% rename from 1.architectures/1.vpc_network/2.vpc-one-az.yaml rename to 
architectures/vpc_network/2.vpc-one-az.yaml diff --git a/1.architectures/1.vpc_network/README.md b/architectures/vpc_network/README.md similarity index 97% rename from 1.architectures/1.vpc_network/README.md rename to architectures/vpc_network/README.md index 896f3e2f9..79ac75679 100644 --- a/1.architectures/1.vpc_network/README.md +++ b/architectures/vpc_network/README.md @@ -48,7 +48,7 @@ aws cloudformation create-stack --stack-name vpc-stack-ml\ #### Architecture Diagram - + ### 2. Template VPC One AZs @@ -83,4 +83,4 @@ aws cloudformation create-stack --stack-name vpc-stack-ml\ #### Architecture Diagram - + diff --git a/1.architectures/1.vpc_network/status.sh b/architectures/vpc_network/status.sh similarity index 100% rename from 1.architectures/1.vpc_network/status.sh rename to architectures/vpc_network/status.sh diff --git a/0.docs/EnableIdentityCenter.png b/assets/EnableIdentityCenter.png similarity index 100% rename from 0.docs/EnableIdentityCenter.png rename to assets/EnableIdentityCenter.png diff --git a/0.docs/IdentityCenterSetup1.png b/assets/IdentityCenterSetup1.png similarity index 100% rename from 0.docs/IdentityCenterSetup1.png rename to assets/IdentityCenterSetup1.png diff --git a/0.docs/IdentityCenterSetup10.png b/assets/IdentityCenterSetup10.png similarity index 100% rename from 0.docs/IdentityCenterSetup10.png rename to assets/IdentityCenterSetup10.png diff --git a/0.docs/IdentityCenterSetup2.png b/assets/IdentityCenterSetup2.png similarity index 100% rename from 0.docs/IdentityCenterSetup2.png rename to assets/IdentityCenterSetup2.png diff --git a/0.docs/IdentityCenterSetup3.png b/assets/IdentityCenterSetup3.png similarity index 100% rename from 0.docs/IdentityCenterSetup3.png rename to assets/IdentityCenterSetup3.png diff --git a/0.docs/IdentityCenterSetup4.png b/assets/IdentityCenterSetup4.png similarity index 100% rename from 0.docs/IdentityCenterSetup4.png rename to assets/IdentityCenterSetup4.png diff --git a/0.docs/IdentityCenterSetup5.png 
b/assets/IdentityCenterSetup5.png similarity index 100% rename from 0.docs/IdentityCenterSetup5.png rename to assets/IdentityCenterSetup5.png diff --git a/0.docs/IdentityCenterSetup6.png b/assets/IdentityCenterSetup6.png similarity index 100% rename from 0.docs/IdentityCenterSetup6.png rename to assets/IdentityCenterSetup6.png diff --git a/0.docs/IdentityCenterSetup7.png b/assets/IdentityCenterSetup7.png similarity index 100% rename from 0.docs/IdentityCenterSetup7.png rename to assets/IdentityCenterSetup7.png diff --git a/0.docs/IdentityCenterSetup8.png b/assets/IdentityCenterSetup8.png similarity index 100% rename from 0.docs/IdentityCenterSetup8.png rename to assets/IdentityCenterSetup8.png diff --git a/0.docs/IdentityCenterSetup9.png b/assets/IdentityCenterSetup9.png similarity index 100% rename from 0.docs/IdentityCenterSetup9.png rename to assets/IdentityCenterSetup9.png diff --git a/0.docs/batch-arch.png b/assets/batch-arch.png similarity index 100% rename from 0.docs/batch-arch.png rename to assets/batch-arch.png diff --git a/0.docs/core-infra-architecture.png b/assets/core-infra-architecture.png similarity index 100% rename from 0.docs/core-infra-architecture.png rename to assets/core-infra-architecture.png diff --git a/0.docs/deploy_prometheus_grafana_cfn.png b/assets/deploy_prometheus_grafana_cfn.png similarity index 100% rename from 0.docs/deploy_prometheus_grafana_cfn.png rename to assets/deploy_prometheus_grafana_cfn.png diff --git a/0.docs/deployment diagrams.png b/assets/deployment diagrams.png similarity index 100% rename from 0.docs/deployment diagrams.png rename to assets/deployment diagrams.png diff --git a/0.docs/diagrams-omnigraffle.graffle b/assets/diagrams-omnigraffle.graffle similarity index 100% rename from 0.docs/diagrams-omnigraffle.graffle rename to assets/diagrams-omnigraffle.graffle diff --git a/0.docs/diagrams-templates.pptx b/assets/diagrams-templates.pptx similarity index 100% rename from 0.docs/diagrams-templates.pptx rename to 
assets/diagrams-templates.pptx diff --git a/0.docs/eks-model-training-multi-az.drawio b/assets/eks-model-training-multi-az.drawio similarity index 100% rename from 0.docs/eks-model-training-multi-az.drawio rename to assets/eks-model-training-multi-az.drawio diff --git a/0.docs/eks-model-training-single-az.png b/assets/eks-model-training-single-az.png similarity index 100% rename from 0.docs/eks-model-training-single-az.png rename to assets/eks-model-training-single-az.png diff --git a/0.docs/fsx-lustre-template.png b/assets/fsx-lustre-template.png similarity index 100% rename from 0.docs/fsx-lustre-template.png rename to assets/fsx-lustre-template.png diff --git a/0.docs/observability_architecture.png b/assets/observability_architecture.png similarity index 100% rename from 0.docs/observability_architecture.png rename to assets/observability_architecture.png diff --git a/0.docs/parallelcluster-arch-diagram.png b/assets/parallelcluster-arch-diagram.png similarity index 100% rename from 0.docs/parallelcluster-arch-diagram.png rename to assets/parallelcluster-arch-diagram.png diff --git a/0.docs/parallelcluster-prerequisites-cfn.png b/assets/parallelcluster-prerequisites-cfn.png similarity index 100% rename from 0.docs/parallelcluster-prerequisites-cfn.png rename to assets/parallelcluster-prerequisites-cfn.png diff --git a/0.docs/ssm-connect-user.png b/assets/ssm-connect-user.png similarity index 100% rename from 0.docs/ssm-connect-user.png rename to assets/ssm-connect-user.png diff --git a/0.docs/ssm-connect.png b/assets/ssm-connect.png similarity index 100% rename from 0.docs/ssm-connect.png rename to assets/ssm-connect.png diff --git a/0.docs/vpc-all-az.png b/assets/vpc-all-az.png similarity index 100% rename from 0.docs/vpc-all-az.png rename to assets/vpc-all-az.png diff --git a/0.docs/vpc-one-az.png b/assets/vpc-one-az.png similarity index 100% rename from 0.docs/vpc-one-az.png rename to assets/vpc-one-az.png diff --git a/0.docs/vpc-template.png 
b/assets/vpc-template.png similarity index 100% rename from 0.docs/vpc-template.png rename to assets/vpc-template.png diff --git a/examples/inference/README.md b/examples/inference/README.md new file mode 100644 index 000000000..df9c77ac1 --- /dev/null +++ b/examples/inference/README.md @@ -0,0 +1,11 @@ +# Inference Examples + +Framework-centric inference engine examples, organized by serving engine. + +| Engine | Example | Description | +|---|---|---| +| [`vllm`](./vllm) | [`dsv3-uccl-nixl`](./vllm/dsv3-uccl-nixl) | DeepSeek-V3 disaggregated (prefill/decode) inference with vLLM, UCCL-EP, and NIXL on EKS | + +More engines (SGLang, TRT-LLM, NIM, Dynamo, Ray Serve, …) are planned, including +content to be merged from [`aws-samples/awsome-inference`](https://github.com/aws-samples/awsome-inference) +(see issue #1056). diff --git a/3.test_cases/pytorch/vllm/README.md b/examples/inference/vllm/README.md similarity index 100% rename from 3.test_cases/pytorch/vllm/README.md rename to examples/inference/vllm/README.md diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/.gitignore b/examples/inference/vllm/dsv3-uccl-nixl/.gitignore similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/.gitignore rename to examples/inference/vllm/dsv3-uccl-nixl/.gitignore diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/Dockerfile b/examples/inference/vllm/dsv3-uccl-nixl/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/Dockerfile rename to examples/inference/vllm/dsv3-uccl-nixl/Dockerfile diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/README.md b/examples/inference/vllm/dsv3-uccl-nixl/README.md similarity index 99% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/README.md rename to examples/inference/vllm/dsv3-uccl-nixl/README.md index e182a0acc..b84cfa6d3 100644 --- a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/README.md +++ b/examples/inference/vllm/dsv3-uccl-nixl/README.md @@ -146,7 +146,7 @@ dsv3-uccl-nixl/ ### 1. 
Configure environment ```bash -cd 3.test_cases/pytorch/vllm/dsv3-uccl-nixl +cd examples/inference/vllm/dsv3-uccl-nixl cp setup/env_vars.example setup/env_vars $EDITOR setup/env_vars # replace every REPLACE_ME placeholder grep REPLACE_ME setup/env_vars # should print nothing when fully filled diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/decode.yaml b/examples/inference/vllm/dsv3-uccl-nixl/manifests/decode.yaml similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/decode.yaml rename to examples/inference/vllm/dsv3-uccl-nixl/manifests/decode.yaml diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/prefill.yaml b/examples/inference/vllm/dsv3-uccl-nixl/manifests/prefill.yaml similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/prefill.yaml rename to examples/inference/vllm/dsv3-uccl-nixl/manifests/prefill.yaml diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/proxy.yaml b/examples/inference/vllm/dsv3-uccl-nixl/manifests/proxy.yaml similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/proxy.yaml rename to examples/inference/vllm/dsv3-uccl-nixl/manifests/proxy.yaml diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/unified.yaml b/examples/inference/vllm/dsv3-uccl-nixl/manifests/unified.yaml similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/manifests/unified.yaml rename to examples/inference/vllm/dsv3-uccl-nixl/manifests/unified.yaml diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/benchmark.sh b/examples/inference/vllm/dsv3-uccl-nixl/recipe/benchmark.sh similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/benchmark.sh rename to examples/inference/vllm/dsv3-uccl-nixl/recipe/benchmark.sh diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/deploy.sh b/examples/inference/vllm/dsv3-uccl-nixl/recipe/deploy.sh similarity index 100% rename from 
3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/deploy.sh rename to examples/inference/vllm/dsv3-uccl-nixl/recipe/deploy.sh diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/teardown.sh b/examples/inference/vllm/dsv3-uccl-nixl/recipe/teardown.sh similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/recipe/teardown.sh rename to examples/inference/vllm/dsv3-uccl-nixl/recipe/teardown.sh diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/build-push.sh b/examples/inference/vllm/dsv3-uccl-nixl/setup/build-push.sh similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/build-push.sh rename to examples/inference/vllm/dsv3-uccl-nixl/setup/build-push.sh diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/env_vars.example b/examples/inference/vllm/dsv3-uccl-nixl/setup/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/env_vars.example rename to examples/inference/vllm/dsv3-uccl-nixl/setup/env_vars.example diff --git a/3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/install-prereqs.sh b/examples/inference/vllm/dsv3-uccl-nixl/setup/install-prereqs.sh similarity index 100% rename from 3.test_cases/pytorch/vllm/dsv3-uccl-nixl/setup/install-prereqs.sh rename to examples/inference/vllm/dsv3-uccl-nixl/setup/install-prereqs.sh diff --git a/3.test_cases/megatron/bionemo/0.Dockerfile b/examples/training/bionemo/0.Dockerfile similarity index 100% rename from 3.test_cases/megatron/bionemo/0.Dockerfile rename to examples/training/bionemo/0.Dockerfile diff --git a/3.test_cases/megatron/bionemo/1.uniref50.slurm b/examples/training/bionemo/1.uniref50.slurm similarity index 100% rename from 3.test_cases/megatron/bionemo/1.uniref50.slurm rename to examples/training/bionemo/1.uniref50.slurm diff --git a/3.test_cases/megatron/bionemo/2.esm1nv_pretrain.slurm b/examples/training/bionemo/2.esm1nv_pretrain.slurm similarity index 100% rename from 
3.test_cases/megatron/bionemo/2.esm1nv_pretrain.slurm rename to examples/training/bionemo/2.esm1nv_pretrain.slurm diff --git a/3.test_cases/megatron/bionemo/README.md b/examples/training/bionemo/README.md similarity index 97% rename from 3.test_cases/megatron/bionemo/README.md rename to examples/training/bionemo/README.md index be54a5e7e..a07ba33a9 100644 --- a/3.test_cases/megatron/bionemo/README.md +++ b/examples/training/bionemo/README.md @@ -59,8 +59,8 @@ export DATASET_PATH=/fsx/ ```bash cd /apps/ -git clone https://github.com/awslabs/awsome-distributed-training.git -cp -r /apps/awsome-distributed-training/3.test_cases/14.bionemo/* ./apps/ +git clone https://github.com/awslabs/awsome-distributed-ai.git +cp -r /apps/awsome-distributed-ai/examples/training/bionemo/* ./apps/ ``` ## 2. Pull Image @@ -93,7 +93,7 @@ All package versions in the above `requirements.txt` file is recommended from Nv ## 4. Build customized docker image To achieve target performance of Nemo-Multimodal with EFA on P5 and P4de instances, we provide a customized -`3.test_cases/14.nemo-multimodal/0.Dockerfile` and we can build a image like below: +`examples/training/bionemo/0.Dockerfile` and we can build a image like below: ``` docker build -t ${DOCKER_IMAGE_NAME}:${TAG} -f 0.Dockerfile .
diff --git a/3.test_cases/megatron/bionemo/bionemo_2.5/Dockerfile b/examples/training/bionemo/bionemo_2.5/Dockerfile similarity index 100% rename from 3.test_cases/megatron/bionemo/bionemo_2.5/Dockerfile rename to examples/training/bionemo/bionemo_2.5/Dockerfile diff --git a/3.test_cases/megatron/bionemo/bionemo_2.5/build.sh b/examples/training/bionemo/bionemo_2.5/build.sh similarity index 100% rename from 3.test_cases/megatron/bionemo/bionemo_2.5/build.sh rename to examples/training/bionemo/bionemo_2.5/build.sh diff --git a/3.test_cases/megatron/bionemo/bionemo_2.5/enroot.sh b/examples/training/bionemo/bionemo_2.5/enroot.sh similarity index 100% rename from 3.test_cases/megatron/bionemo/bionemo_2.5/enroot.sh rename to examples/training/bionemo/bionemo_2.5/enroot.sh diff --git a/3.test_cases/megatron/bionemo/bionemo_2.5/get-data.sh b/examples/training/bionemo/bionemo_2.5/get-data.sh similarity index 100% rename from 3.test_cases/megatron/bionemo/bionemo_2.5/get-data.sh rename to examples/training/bionemo/bionemo_2.5/get-data.sh diff --git a/3.test_cases/megatron/bionemo/bionemo_2.5/train-esm.sbatch b/examples/training/bionemo/bionemo_2.5/train-esm.sbatch similarity index 100% rename from 3.test_cases/megatron/bionemo/bionemo_2.5/train-esm.sbatch rename to examples/training/bionemo/bionemo_2.5/train-esm.sbatch diff --git a/3.test_cases/megatron/bionemo/prepare_uniref50.py b/examples/training/bionemo/prepare_uniref50.py similarity index 100% rename from 3.test_cases/megatron/bionemo/prepare_uniref50.py rename to examples/training/bionemo/prepare_uniref50.py diff --git a/3.test_cases/megatron/bionemo/requirements.txt b/examples/training/bionemo/requirements.txt similarity index 100% rename from 3.test_cases/megatron/bionemo/requirements.txt rename to examples/training/bionemo/requirements.txt diff --git a/3.test_cases/pytorch/ddp/.gitignore b/examples/training/ddp/.gitignore similarity index 100% rename from 3.test_cases/pytorch/ddp/.gitignore rename to 
examples/training/ddp/.gitignore diff --git a/3.test_cases/pytorch/ddp/Dockerfile b/examples/training/ddp/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/ddp/Dockerfile rename to examples/training/ddp/Dockerfile diff --git a/3.test_cases/pytorch/ddp/README.md b/examples/training/ddp/README.md similarity index 100% rename from 3.test_cases/pytorch/ddp/README.md rename to examples/training/ddp/README.md diff --git a/3.test_cases/pytorch/ddp/ddp.py b/examples/training/ddp/ddp.py similarity index 100% rename from 3.test_cases/pytorch/ddp/ddp.py rename to examples/training/ddp/ddp.py diff --git a/3.test_cases/pytorch/ddp/kubernetes/README.md b/examples/training/ddp/kubernetes/README.md similarity index 98% rename from 3.test_cases/pytorch/ddp/kubernetes/README.md rename to examples/training/ddp/kubernetes/README.md index 1452570f5..9fa5688c0 100644 --- a/3.test_cases/pytorch/ddp/kubernetes/README.md +++ b/examples/training/ddp/kubernetes/README.md @@ -5,7 +5,7 @@ The guide assumes that you have the following: * An Amazon FSx for Lustre persistent volume claim named `fsx-pv`, you can use an example from [here](https://github.com/aws-samples/aws-do-eks/tree/main/Container-Root/eks/deployment/csi/fsx), if you need to create one. * Docker -We recommend that you setup a Kubernetes cluster using the templates in the architectures [directory](../../1.architectures). +We recommend that you setup a Kubernetes cluster using the templates in the architectures [directory](../../../../architectures). ## 3. 
Submit training job using container diff --git a/3.test_cases/pytorch/ddp/kubernetes/ddp-custom-container.yaml-template b/examples/training/ddp/kubernetes/ddp-custom-container.yaml-template similarity index 100% rename from 3.test_cases/pytorch/ddp/kubernetes/ddp-custom-container.yaml-template rename to examples/training/ddp/kubernetes/ddp-custom-container.yaml-template diff --git a/3.test_cases/pytorch/ddp/slurm/0.create-venv.sh b/examples/training/ddp/slurm/0.create-venv.sh similarity index 100% rename from 3.test_cases/pytorch/ddp/slurm/0.create-venv.sh rename to examples/training/ddp/slurm/0.create-venv.sh diff --git a/3.test_cases/pytorch/ddp/slurm/1.venv-train.sbatch b/examples/training/ddp/slurm/1.venv-train.sbatch similarity index 100% rename from 3.test_cases/pytorch/ddp/slurm/1.venv-train.sbatch rename to examples/training/ddp/slurm/1.venv-train.sbatch diff --git a/3.test_cases/pytorch/ddp/slurm/2.create-enroot-image.sh b/examples/training/ddp/slurm/2.create-enroot-image.sh similarity index 100% rename from 3.test_cases/pytorch/ddp/slurm/2.create-enroot-image.sh rename to examples/training/ddp/slurm/2.create-enroot-image.sh diff --git a/3.test_cases/pytorch/ddp/slurm/3.container-train.sbatch b/examples/training/ddp/slurm/3.container-train.sbatch similarity index 100% rename from 3.test_cases/pytorch/ddp/slurm/3.container-train.sbatch rename to examples/training/ddp/slurm/3.container-train.sbatch diff --git a/3.test_cases/pytorch/ddp/slurm/README.md b/examples/training/ddp/slurm/README.md similarity index 99% rename from 3.test_cases/pytorch/ddp/slurm/README.md rename to examples/training/ddp/slurm/README.md index d8c4a1c5f..f97070b09 100644 --- a/3.test_cases/pytorch/ddp/slurm/README.md +++ b/examples/training/ddp/slurm/README.md @@ -6,7 +6,7 @@ The guide assumes that you have the following: * An FSx for Lustre filesystem mounted on `/fsx`. * `enroot` if you want to run the container example. 
-We recommend that you setup a Slurm cluster using the templates in the architectures [directory](../../1.architectures). +We recommend that you setup a Slurm cluster using the templates in the architectures [directory](../../../../architectures). ## 2. Submit training job using virtual environment on Slurm diff --git a/3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile b/examples/training/deepspeed/0.deepspeed.dockerfile similarity index 100% rename from 3.test_cases/pytorch/deepspeed/0.deepspeed.dockerfile rename to examples/training/deepspeed/0.deepspeed.dockerfile diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/examples/training/deepspeed/1.build-image.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/1.build-image.sbatch rename to examples/training/deepspeed/1.build-image.sbatch diff --git a/3.test_cases/pytorch/deepspeed/Makefile b/examples/training/deepspeed/Makefile similarity index 100% rename from 3.test_cases/pytorch/deepspeed/Makefile rename to examples/training/deepspeed/Makefile diff --git a/3.test_cases/pytorch/deepspeed/README.md b/examples/training/deepspeed/README.md similarity index 99% rename from 3.test_cases/pytorch/deepspeed/README.md rename to examples/training/deepspeed/README.md index 82e34d46a..7904ce5e3 100644 --- a/3.test_cases/pytorch/deepspeed/README.md +++ b/examples/training/deepspeed/README.md @@ -12,7 +12,7 @@ ## Prerequisites -- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../1.architectures). +- A functional Slurm cluster on AWS. We recommend [SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html) or the templates in the [architectures directory](../../../architectures). 
- [Docker](https://docs.docker.com/engine/install/), [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed on compute nodes. - An [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) filesystem mounted on `/fsx`. - NVIDIA GPU instances with [EFA networking](https://aws.amazon.com/hpc/efa/) (B200, H100, A100, etc.). diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/.gitignore b/examples/training/deepspeed/examples_megatron_deepspeed/.gitignore similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/.gitignore rename to examples/training/deepspeed/examples_megatron_deepspeed/.gitignore diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md b/examples/training/deepspeed/examples_megatron_deepspeed/README.md similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md rename to examples/training/deepspeed/examples_megatron_deepspeed/README.md diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/.gitignore b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/.gitignore similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/.gitignore rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/.gitignore diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch diff --git 
a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/configs/ds_config.json diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh 
b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/src/convert_llama_weights_to_hf.py b/examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/src/convert_llama_weights_to_hf.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/src/convert_llama_weights_to_hf.py rename to examples/training/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/src/convert_llama_weights_to_hf.py diff --git a/3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json 
b/examples/training/deepspeed/gpt/configs/ds_config_103b_template.json similarity index 100% rename from 3.test_cases/pytorch/deepspeed/gpt/configs/ds_config_103b_template.json rename to examples/training/deepspeed/gpt/configs/ds_config_103b_template.json diff --git a/3.test_cases/pytorch/deepspeed/gpt/parse_results.py b/examples/training/deepspeed/gpt/parse_results.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/gpt/parse_results.py rename to examples/training/deepspeed/gpt/parse_results.py diff --git a/3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch b/examples/training/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch rename to examples/training/deepspeed/gpt/slurm/pretrain_gpt_103b.sbatch diff --git a/3.test_cases/pytorch/deepspeed/qlora/0.build-image.sh b/examples/training/deepspeed/qlora/0.build-image.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/0.build-image.sh rename to examples/training/deepspeed/qlora/0.build-image.sh diff --git a/3.test_cases/pytorch/deepspeed/qlora/1.deploy-training.sh b/examples/training/deepspeed/qlora/1.deploy-training.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/1.deploy-training.sh rename to examples/training/deepspeed/qlora/1.deploy-training.sh diff --git a/3.test_cases/pytorch/deepspeed/qlora/2.cleanup.sh b/examples/training/deepspeed/qlora/2.cleanup.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/2.cleanup.sh rename to examples/training/deepspeed/qlora/2.cleanup.sh diff --git a/3.test_cases/pytorch/deepspeed/qlora/Dockerfile b/examples/training/deepspeed/qlora/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/Dockerfile rename to examples/training/deepspeed/qlora/Dockerfile diff --git a/3.test_cases/pytorch/deepspeed/qlora/README.md b/examples/training/deepspeed/qlora/README.md 
similarity index 98% rename from 3.test_cases/pytorch/deepspeed/qlora/README.md rename to examples/training/deepspeed/qlora/README.md index b393b2d64..84fabb72e 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/README.md +++ b/examples/training/deepspeed/qlora/README.md @@ -66,8 +66,8 @@ kubectl logs -f qwen3-qlora-training-zero3-master-0 -n ml-training ```bash # 1. Clone the repository to the shared filesystem cd /fsx -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/deepspeed/qlora +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/training/deepspeed/qlora # 2. Create virtual environment and install dependencies python3 -m venv /fsx/venvs/qwen3-qlora diff --git a/3.test_cases/pytorch/deepspeed/qlora/configs/deepspeed_zero2.json b/examples/training/deepspeed/qlora/configs/deepspeed_zero2.json similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/configs/deepspeed_zero2.json rename to examples/training/deepspeed/qlora/configs/deepspeed_zero2.json diff --git a/3.test_cases/pytorch/deepspeed/qlora/configs/deepspeed_zero3.json b/examples/training/deepspeed/qlora/configs/deepspeed_zero3.json similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/configs/deepspeed_zero3.json rename to examples/training/deepspeed/qlora/configs/deepspeed_zero3.json diff --git a/3.test_cases/pytorch/deepspeed/qlora/configs/training_config_zero2.yaml b/examples/training/deepspeed/qlora/configs/training_config_zero2.yaml similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/configs/training_config_zero2.yaml rename to examples/training/deepspeed/qlora/configs/training_config_zero2.yaml diff --git a/3.test_cases/pytorch/deepspeed/qlora/configs/training_config_zero3.yaml b/examples/training/deepspeed/qlora/configs/training_config_zero3.yaml similarity index 100% rename from 
3.test_cases/pytorch/deepspeed/qlora/configs/training_config_zero3.yaml rename to examples/training/deepspeed/qlora/configs/training_config_zero3.yaml diff --git a/3.test_cases/pytorch/deepspeed/qlora/docs/QLORA_EXPLAINED.md b/examples/training/deepspeed/qlora/docs/QLORA_EXPLAINED.md similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/docs/QLORA_EXPLAINED.md rename to examples/training/deepspeed/qlora/docs/QLORA_EXPLAINED.md diff --git a/3.test_cases/pytorch/deepspeed/qlora/docs/TROUBLESHOOTING.md b/examples/training/deepspeed/qlora/docs/TROUBLESHOOTING.md similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/docs/TROUBLESHOOTING.md rename to examples/training/deepspeed/qlora/docs/TROUBLESHOOTING.md diff --git a/3.test_cases/pytorch/deepspeed/qlora/entrypoint.sh b/examples/training/deepspeed/qlora/entrypoint.sh similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/entrypoint.sh rename to examples/training/deepspeed/qlora/entrypoint.sh diff --git a/3.test_cases/pytorch/deepspeed/qlora/kubernetes/README.md b/examples/training/deepspeed/qlora/kubernetes/README.md similarity index 96% rename from 3.test_cases/pytorch/deepspeed/qlora/kubernetes/README.md rename to examples/training/deepspeed/qlora/kubernetes/README.md index df7bc5517..9a85766b8 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/kubernetes/README.md +++ b/examples/training/deepspeed/qlora/kubernetes/README.md @@ -10,7 +10,7 @@ Instructions for deploying QLoRA training on SageMaker HyperPod with EKS orchest - NVIDIA device plugin - Health monitoring agents (node health checks, deep health checks for GPU/NCCL) - See [1.architectures/7.sagemaker-hyperpod-eks/](../../../../../1.architectures/7.sagemaker-hyperpod-eks/) for cluster setup. + See [architectures/sagemaker-hyperpod-eks/](../../../../../architectures/sagemaker-hyperpod-eks/) for cluster setup. 2. **FSx for Lustre filesystem** provisioned in the same VPC as the cluster. 
Note the File System ID, DNS name, and Mount name from the AWS console. @@ -150,7 +150,7 @@ MIG_PROFILE=4g.40gb NUM_GPUS=2 \ ### Enabling MIG on Nodes -MIG mode must be enabled by a cluster administrator. See the [Slurm MIG guide](../slurm/README.md#mig-configuration) for step-by-step instructions — the `nvidia-smi` commands are the same on EKS worker nodes. On HyperPod EKS, you can configure MIG in the [lifecycle scripts](../../../../../1.architectures/7.sagemaker-hyperpod-eks/) or via a DaemonSet that runs on GPU nodes. +MIG mode must be enabled by a cluster administrator. See the [Slurm MIG guide](../slurm/README.md#mig-configuration) for step-by-step instructions — the `nvidia-smi` commands are the same on EKS worker nodes. On HyperPod EKS, you can configure MIG in the [lifecycle scripts](../../../../../architectures/sagemaker-hyperpod-eks/) or via a DaemonSet that runs on GPU nodes. ## Monitor Training diff --git a/3.test_cases/pytorch/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero2.yaml b/examples/training/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero2.yaml similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero2.yaml rename to examples/training/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero2.yaml diff --git a/3.test_cases/pytorch/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero3.yaml b/examples/training/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero3.yaml similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero3.yaml rename to examples/training/deepspeed/qlora/kubernetes/qwen3_8b-qlora-zero3.yaml diff --git a/3.test_cases/pytorch/deepspeed/qlora/kubernetes/storage.yaml b/examples/training/deepspeed/qlora/kubernetes/storage.yaml similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/kubernetes/storage.yaml rename to examples/training/deepspeed/qlora/kubernetes/storage.yaml diff --git a/3.test_cases/pytorch/deepspeed/qlora/requirements.txt 
b/examples/training/deepspeed/qlora/requirements.txt similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/requirements.txt rename to examples/training/deepspeed/qlora/requirements.txt diff --git a/3.test_cases/pytorch/deepspeed/qlora/slurm/README.md b/examples/training/deepspeed/qlora/slurm/README.md similarity index 96% rename from 3.test_cases/pytorch/deepspeed/qlora/slurm/README.md rename to examples/training/deepspeed/qlora/slurm/README.md index bb9068b41..fd66a3e68 100644 --- a/3.test_cases/pytorch/deepspeed/qlora/slurm/README.md +++ b/examples/training/deepspeed/qlora/slurm/README.md @@ -7,7 +7,7 @@ For the EKS/Kubernetes deployment path, see the [top-level README](../README.md) ## Prerequisites 1. **SageMaker HyperPod cluster** with GPU worker nodes provisioned and running. - See [`1.architectures/5.sagemaker-hyperpod/`](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod) for cluster setup. + See [`architectures/sagemaker-hyperpod-slurm/`](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-slurm) for cluster setup. 2. **Shared filesystem** — HyperPod clusters use Amazon FSx for Lustre mounted at `/fsx`. 
@@ -31,8 +31,8 @@ ssh ubuntu@ # Clone the repo to shared storage cd /fsx -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/deepspeed/qlora +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/training/deepspeed/qlora # Install the venv package (not pre-installed on HyperPod AMI) sudo apt-get update && sudo apt-get install -y python3.10-venv @@ -77,7 +77,7 @@ mv qwen3-qlora+latest.sqsh /fsx/containers/qwen3-qlora.sqsh ### ZeRO-2 (Default) ```bash -cd /fsx/awsome-distributed-training/3.test_cases/pytorch/deepspeed/qlora/slurm +cd /fsx/awsome-distributed-ai/examples/training/deepspeed/qlora/slurm # Activate venv (skip if using container mode) source /fsx/venvs/qwen3-qlora/bin/activate diff --git a/3.test_cases/pytorch/deepspeed/qlora/slurm/qwen3_8b-qlora-zero2.sbatch b/examples/training/deepspeed/qlora/slurm/qwen3_8b-qlora-zero2.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/slurm/qwen3_8b-qlora-zero2.sbatch rename to examples/training/deepspeed/qlora/slurm/qwen3_8b-qlora-zero2.sbatch diff --git a/3.test_cases/pytorch/deepspeed/qlora/slurm/qwen3_8b-qlora-zero3.sbatch b/examples/training/deepspeed/qlora/slurm/qwen3_8b-qlora-zero3.sbatch similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/slurm/qwen3_8b-qlora-zero3.sbatch rename to examples/training/deepspeed/qlora/slurm/qwen3_8b-qlora-zero3.sbatch diff --git a/3.test_cases/pytorch/FSDP/src/model_utils/__init__.py b/examples/training/deepspeed/qlora/src/__init__.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/model_utils/__init__.py rename to examples/training/deepspeed/qlora/src/__init__.py diff --git a/3.test_cases/pytorch/deepspeed/qlora/src/config.py b/examples/training/deepspeed/qlora/src/config.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/config.py rename to 
examples/training/deepspeed/qlora/src/config.py diff --git a/3.test_cases/pytorch/deepspeed/qlora/src/data_preparation.py b/examples/training/deepspeed/qlora/src/data_preparation.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/data_preparation.py rename to examples/training/deepspeed/qlora/src/data_preparation.py diff --git a/3.test_cases/pytorch/deepspeed/qlora/src/inference_demo.py b/examples/training/deepspeed/qlora/src/inference_demo.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/inference_demo.py rename to examples/training/deepspeed/qlora/src/inference_demo.py diff --git a/3.test_cases/pytorch/deepspeed/qlora/src/model_setup.py b/examples/training/deepspeed/qlora/src/model_setup.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/model_setup.py rename to examples/training/deepspeed/qlora/src/model_setup.py diff --git a/3.test_cases/pytorch/deepspeed/qlora/src/train.py b/examples/training/deepspeed/qlora/src/train.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/train.py rename to examples/training/deepspeed/qlora/src/train.py diff --git a/3.test_cases/pytorch/FSDP/.gitignore b/examples/training/fsdp/.gitignore similarity index 100% rename from 3.test_cases/pytorch/FSDP/.gitignore rename to examples/training/fsdp/.gitignore diff --git a/3.test_cases/pytorch/FSDP/Dockerfile b/examples/training/fsdp/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/FSDP/Dockerfile rename to examples/training/fsdp/Dockerfile diff --git a/3.test_cases/pytorch/FSDP/README.md b/examples/training/fsdp/README.md similarity index 92% rename from 3.test_cases/pytorch/FSDP/README.md rename to examples/training/fsdp/README.md index 3d48bf79a..031f8afd7 100644 --- a/3.test_cases/pytorch/FSDP/README.md +++ b/examples/training/fsdp/README.md @@ -6,7 +6,7 @@ It is designed to be simple with no data preparation or tokenizer to download, a ## Prerequisites To run 
FSDP training, you will need to create a training cluster based on Slurm or Kubermetes with an [Amazon FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) -You can find instruction how to create a Amazon SageMaker Hyperpod cluster with [Slurm](https://catalog.workshops.aws/sagemaker-hyperpod/en-US), [Kubernetes](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US) or with in [Amazon EKS](../../1.architectures). +You can find instruction how to create a Amazon SageMaker Hyperpod cluster with [Slurm](https://catalog.workshops.aws/sagemaker-hyperpod/en-US), [Kubernetes](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US) or with in [Amazon EKS](../../../architectures). ## FSDP Training diff --git a/3.test_cases/pytorch/FSDP/generate-sbatch-training-files.py b/examples/training/fsdp/generate-sbatch-training-files.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/generate-sbatch-training-files.py rename to examples/training/fsdp/generate-sbatch-training-files.py diff --git a/3.test_cases/pytorch/FSDP/kubernetes/README.md b/examples/training/fsdp/kubernetes/README.md similarity index 95% rename from 3.test_cases/pytorch/FSDP/kubernetes/README.md rename to examples/training/fsdp/kubernetes/README.md index 9c6ce9f91..93e5782bb 100644 --- a/3.test_cases/pytorch/FSDP/kubernetes/README.md +++ b/examples/training/fsdp/kubernetes/README.md @@ -7,7 +7,7 @@ This document will run you through how to run Llama 3.1 8B model training with F ## 0. Prerequisites ### 0.1. EKS Cluster -Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster. Instructions can be found in [1.architectures](../../1.architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. +Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster. 
Instructions can be found in [architectures](../../../../architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. ### 0.2. Connect to your EKS Cluster @@ -23,12 +23,12 @@ kubectl config current-context ``` arn:aws:eks:us-west-1:xxxxxxxxxxxx:cluster/xxx-eks-cluster ``` -### 0.3. Clone the awsome-distributed-training reposource code +### 0.3. Clone the awsome-distributed-ai reposource code Clone this repo. ``` -git clone https://github.com/awslabs/awsome-distributed-training/ -cd awsome-distributed-training/3.test_cases/pytorch/FSDP/kubernetes +git clone https://github.com/awslabs/awsome-distributed-ai/ +cd awsome-distributed-ai/examples/training/fsdp/kubernetes ``` ### 0.4. Envsubst @@ -54,7 +54,7 @@ docker build -f Dockerfile -t ${REGISTRY}fsdp:pytorch2.7.1 . popd ``` -The PyTorch FSDP container uses the [nccl-tests](https://github.com/awslabs/awsome-distributed-training/blob/main/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile) container as base. +The PyTorch FSDP container uses the [nccl-tests](https://github.com/awslabs/awsome-distributed-ai/blob/main/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile) container as base. ## 2. 
Push container image to Amazon ECR diff --git a/3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template b/examples/training/fsdp/kubernetes/fsdp.yaml-template similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template rename to examples/training/fsdp/kubernetes/fsdp.yaml-template diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama2_13b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama2_13b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama2_13b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama2_13b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama2_70b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama2_70b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama2_70b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama2_70b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama2_7b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama2_7b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama2_7b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama2_7b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_70b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama3_1_70b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_1_70b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama3_1_70b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp-hpto.yaml b/examples/training/fsdp/kubernetes/llama3_1_8b-fsdp-hpto.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp-hpto.yaml rename to examples/training/fsdp/kubernetes/llama3_1_8b-fsdp-hpto.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama3_1_8b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp.yaml rename to 
examples/training/fsdp/kubernetes/llama3_1_8b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp-hpto.yaml b/examples/training/fsdp/kubernetes/llama3_2_1b-fsdp-hpto.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp-hpto.yaml rename to examples/training/fsdp/kubernetes/llama3_2_1b-fsdp-hpto.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama3_2_1b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama3_2_1b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/llama3_2_3b-fsdp.yaml b/examples/training/fsdp/kubernetes/llama3_2_3b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/llama3_2_3b-fsdp.yaml rename to examples/training/fsdp/kubernetes/llama3_2_3b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/mathstral_7b-fsdp.yaml b/examples/training/fsdp/kubernetes/mathstral_7b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/mathstral_7b-fsdp.yaml rename to examples/training/fsdp/kubernetes/mathstral_7b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/mistral_8x7b-fsdp.yaml b/examples/training/fsdp/kubernetes/mistral_8x7b-fsdp.yaml similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/mistral_8x7b-fsdp.yaml rename to examples/training/fsdp/kubernetes/mistral_8x7b-fsdp.yaml diff --git a/3.test_cases/pytorch/FSDP/kubernetes/training_kubernetes.template b/examples/training/fsdp/kubernetes/training_kubernetes.template similarity index 100% rename from 3.test_cases/pytorch/FSDP/kubernetes/training_kubernetes.template rename to examples/training/fsdp/kubernetes/training_kubernetes.template diff --git a/3.test_cases/pytorch/FSDP/models/llama2_13b.txt b/examples/training/fsdp/models/llama2_13b.txt similarity index 100% rename from 
3.test_cases/pytorch/FSDP/models/llama2_13b.txt rename to examples/training/fsdp/models/llama2_13b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama2_70b.txt b/examples/training/fsdp/models/llama2_70b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama2_70b.txt rename to examples/training/fsdp/models/llama2_70b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama2_7b.txt b/examples/training/fsdp/models/llama2_7b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama2_7b.txt rename to examples/training/fsdp/models/llama2_7b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama3_1_70b.txt b/examples/training/fsdp/models/llama3_1_70b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama3_1_70b.txt rename to examples/training/fsdp/models/llama3_1_70b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama3_1_8b.txt b/examples/training/fsdp/models/llama3_1_8b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama3_1_8b.txt rename to examples/training/fsdp/models/llama3_1_8b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama3_2_1b.txt b/examples/training/fsdp/models/llama3_2_1b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama3_2_1b.txt rename to examples/training/fsdp/models/llama3_2_1b.txt diff --git a/3.test_cases/pytorch/FSDP/models/llama3_2_3b.txt b/examples/training/fsdp/models/llama3_2_3b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/llama3_2_3b.txt rename to examples/training/fsdp/models/llama3_2_3b.txt diff --git a/3.test_cases/pytorch/FSDP/models/mathstral_7b.txt b/examples/training/fsdp/models/mathstral_7b.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/models/mathstral_7b.txt rename to examples/training/fsdp/models/mathstral_7b.txt diff --git a/3.test_cases/pytorch/FSDP/models/mistral_8x7b.txt b/examples/training/fsdp/models/mistral_8x7b.txt similarity index 100% rename from 
3.test_cases/pytorch/FSDP/models/mistral_8x7b.txt rename to examples/training/fsdp/models/mistral_8x7b.txt diff --git a/3.test_cases/pytorch/FSDP/slurm/README.md b/examples/training/fsdp/slurm/README.md similarity index 99% rename from 3.test_cases/pytorch/FSDP/slurm/README.md rename to examples/training/fsdp/slurm/README.md index 203798d49..f12eb7283 100644 --- a/3.test_cases/pytorch/FSDP/slurm/README.md +++ b/examples/training/fsdp/slurm/README.md @@ -11,8 +11,8 @@ On your cluster head node, ```bash cd /fsx -git clone https://github.com/awslabs/awsome-distributed-training/ -cd awsome-distributed-training/3.test_cases/pytorch/FSDP/slurm +git clone https://github.com/awslabs/awsome-distributed-ai/ +cd awsome-distributed-ai/examples/training/fsdp/slurm ``` 3. You can launch the training through: diff --git a/3.test_cases/pytorch/FSDP/slurm/create_venv.sh b/examples/training/fsdp/slurm/create_venv.sh similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/create_venv.sh rename to examples/training/fsdp/slurm/create_venv.sh diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch b/examples/training/fsdp/slurm/llama2_13b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch rename to examples/training/fsdp/slurm/llama2_13b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch b/examples/training/fsdp/slurm/llama2_70b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch rename to examples/training/fsdp/slurm/llama2_70b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch b/examples/training/fsdp/slurm/llama2_7b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch rename to examples/training/fsdp/slurm/llama2_7b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch 
b/examples/training/fsdp/slurm/llama3_1_70b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch rename to examples/training/fsdp/slurm/llama3_1_70b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch b/examples/training/fsdp/slurm/llama3_1_8b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch rename to examples/training/fsdp/slurm/llama3_1_8b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch b/examples/training/fsdp/slurm/llama3_2_1b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch rename to examples/training/fsdp/slurm/llama3_2_1b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch b/examples/training/fsdp/slurm/llama3_2_3b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch rename to examples/training/fsdp/slurm/llama3_2_3b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch b/examples/training/fsdp/slurm/mathstral_7b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch rename to examples/training/fsdp/slurm/mathstral_7b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch b/examples/training/fsdp/slurm/mistral_8x7b-training.sbatch similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/mistral_8x7b-training.sbatch rename to examples/training/fsdp/slurm/mistral_8x7b-training.sbatch diff --git a/3.test_cases/pytorch/FSDP/slurm/training-sub.template b/examples/training/fsdp/slurm/training-sub.template similarity index 100% rename from 3.test_cases/pytorch/FSDP/slurm/training-sub.template rename to examples/training/fsdp/slurm/training-sub.template diff --git 
a/3.test_cases/pytorch/deepspeed/qlora/src/__init__.py b/examples/training/fsdp/src/model_utils/__init__.py similarity index 100% rename from 3.test_cases/pytorch/deepspeed/qlora/src/__init__.py rename to examples/training/fsdp/src/model_utils/__init__.py diff --git a/3.test_cases/pytorch/FSDP/src/model_utils/arguments.py b/examples/training/fsdp/src/model_utils/arguments.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/model_utils/arguments.py rename to examples/training/fsdp/src/model_utils/arguments.py diff --git a/3.test_cases/pytorch/FSDP/src/model_utils/checkpoint.py b/examples/training/fsdp/src/model_utils/checkpoint.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/model_utils/checkpoint.py rename to examples/training/fsdp/src/model_utils/checkpoint.py diff --git a/3.test_cases/pytorch/FSDP/src/model_utils/concat_dataset.py b/examples/training/fsdp/src/model_utils/concat_dataset.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/model_utils/concat_dataset.py rename to examples/training/fsdp/src/model_utils/concat_dataset.py diff --git a/3.test_cases/pytorch/FSDP/src/model_utils/train_utils.py b/examples/training/fsdp/src/model_utils/train_utils.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/model_utils/train_utils.py rename to examples/training/fsdp/src/model_utils/train_utils.py diff --git a/3.test_cases/pytorch/FSDP/src/requirements.txt b/examples/training/fsdp/src/requirements.txt similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/requirements.txt rename to examples/training/fsdp/src/requirements.txt diff --git a/3.test_cases/pytorch/FSDP/src/train.py b/examples/training/fsdp/src/train.py similarity index 100% rename from 3.test_cases/pytorch/FSDP/src/train.py rename to examples/training/fsdp/src/train.py diff --git a/3.test_cases/jax/README.md b/examples/training/jax/README.md similarity index 100% rename from 3.test_cases/jax/README.md rename to 
examples/training/jax/README.md diff --git a/3.test_cases/jax/jax.sbatch b/examples/training/jax/jax.sbatch similarity index 100% rename from 3.test_cases/jax/jax.sbatch rename to examples/training/jax/jax.sbatch diff --git a/3.test_cases/jax/jax_paxml.Dockerfile b/examples/training/jax/jax_paxml.Dockerfile similarity index 100% rename from 3.test_cases/jax/jax_paxml.Dockerfile rename to examples/training/jax/jax_paxml.Dockerfile diff --git a/3.test_cases/jax/run_paxml.sh b/examples/training/jax/run_paxml.sh similarity index 100% rename from 3.test_cases/jax/run_paxml.sh rename to examples/training/jax/run_paxml.sh diff --git a/3.test_cases/megatron/megatron-lm/README.md b/examples/training/megatron-lm/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/README.md rename to examples/training/megatron-lm/README.md diff --git a/3.test_cases/megatron/megatron-lm/aws-megatron-lm.Dockerfile b/examples/training/megatron-lm/aws-megatron-lm.Dockerfile similarity index 100% rename from 3.test_cases/megatron/megatron-lm/aws-megatron-lm.Dockerfile rename to examples/training/megatron-lm/aws-megatron-lm.Dockerfile diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/README.md b/examples/training/megatron-lm/kubernetes/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/README.md rename to examples/training/megatron-lm/kubernetes/README.md diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/README.md b/examples/training/megatron-lm/kubernetes/gpt3/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/gpt3/README.md rename to examples/training/megatron-lm/kubernetes/gpt3/README.md diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/.gitignore b/examples/training/megatron-lm/kubernetes/gpt3/manifests/.gitignore similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/.gitignore rename to 
examples/training/megatron-lm/kubernetes/gpt3/manifests/.gitignore diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template b/examples/training/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template rename to examples/training/megatron-lm/kubernetes/gpt3/manifests/getdata-job.yaml-template diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template b/examples/training/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template rename to examples/training/megatron-lm/kubernetes/gpt3/manifests/prepdata-job.yaml-template diff --git a/3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template b/examples/training/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template similarity index 100% rename from 3.test_cases/megatron/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template rename to examples/training/megatron-lm/kubernetes/gpt3/manifests/pytorchjob.yaml-template diff --git a/3.test_cases/megatron/megatron-lm/slurm/Makefile b/examples/training/megatron-lm/slurm/Makefile similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/Makefile rename to examples/training/megatron-lm/slurm/Makefile diff --git a/3.test_cases/megatron/megatron-lm/slurm/README.md b/examples/training/megatron-lm/slurm/README.md similarity index 95% rename from 3.test_cases/megatron/megatron-lm/slurm/README.md rename to examples/training/megatron-lm/slurm/README.md index 5eb98d714..2806e3e06 100755 --- a/3.test_cases/megatron/megatron-lm/slurm/README.md +++ b/examples/training/megatron-lm/slurm/README.md @@ -11,7 +11,7 @@ This guide assumes that you have the following: - Docker, for Slurm 
[Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) need to be installed as well. - An FSx for Lustre filesystem mounted on `/fsx` in all Slurm nodes. -It is recommended that you use the templates for [AWS Parallel Cluster](../../../1.architectures/2.aws-parallelcluster/) or [Amazon SageMaker HyperPod Slurm](../../../1.architectures/5.sagemaker-hyperpod) set up. +It is recommended that you use the templates for [AWS Parallel Cluster](../../../../architectures/aws-parallelcluster/) or [Amazon SageMaker HyperPod Slurm](../../../../architectures/sagemaker-hyperpod-slurm) set up. You will also setup the following variables in your terminal environment. diff --git a/3.test_cases/megatron/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch b/examples/training/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch rename to examples/training/megatron-lm/slurm/gpt3/1.data-preprocessing.sbatch diff --git a/3.test_cases/megatron/megatron-lm/slurm/gpt3/2.distributed-training.sbatch b/examples/training/megatron-lm/slurm/gpt3/2.distributed-training.sbatch similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/gpt3/2.distributed-training.sbatch rename to examples/training/megatron-lm/slurm/gpt3/2.distributed-training.sbatch diff --git a/3.test_cases/megatron/megatron-lm/slurm/gpt3/README.md b/examples/training/megatron-lm/slurm/gpt3/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/gpt3/README.md rename to examples/training/megatron-lm/slurm/gpt3/README.md diff --git a/3.test_cases/megatron/megatron-lm/slurm/llama2/README.md b/examples/training/megatron-lm/slurm/llama2/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/llama2/README.md rename to examples/training/megatron-lm/slurm/llama2/README.md diff --git 
a/3.test_cases/megatron/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch b/examples/training/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch rename to examples/training/megatron-lm/slurm/llama2/data-preproc-llama2.sbatch diff --git a/3.test_cases/megatron/megatron-lm/slurm/llama2/pretrain-llama2.sbatch b/examples/training/megatron-lm/slurm/llama2/pretrain-llama2.sbatch similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/llama2/pretrain-llama2.sbatch rename to examples/training/megatron-lm/slurm/llama2/pretrain-llama2.sbatch diff --git a/3.test_cases/megatron/megatron-lm/slurm/llama3/README.md b/examples/training/megatron-lm/slurm/llama3/README.md similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/llama3/README.md rename to examples/training/megatron-lm/slurm/llama3/README.md diff --git a/3.test_cases/megatron/megatron-lm/slurm/llama3/pretrain-llama3-8b.sbatch b/examples/training/megatron-lm/slurm/llama3/pretrain-llama3-8b.sbatch similarity index 100% rename from 3.test_cases/megatron/megatron-lm/slurm/llama3/pretrain-llama3-8b.sbatch rename to examples/training/megatron-lm/slurm/llama3/pretrain-llama3-8b.sbatch diff --git a/3.test_cases/megatron/megatron-lm/test_megatron_lm.py b/examples/training/megatron-lm/test_megatron_lm.py similarity index 100% rename from 3.test_cases/megatron/megatron-lm/test_megatron_lm.py rename to examples/training/megatron-lm/test_megatron_lm.py diff --git a/3.test_cases/pytorch/mosaicml-composer/mpt/0.llm-foundry.Dockerfile b/examples/training/mosaicml-composer/mpt/0.llm-foundry.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/mpt/0.llm-foundry.Dockerfile rename to examples/training/mosaicml-composer/mpt/0.llm-foundry.Dockerfile diff --git a/3.test_cases/pytorch/mosaicml-composer/mpt/1.c4-preprocess.sbatch 
b/examples/training/mosaicml-composer/mpt/1.c4-preprocess.sbatch similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/mpt/1.c4-preprocess.sbatch rename to examples/training/mosaicml-composer/mpt/1.c4-preprocess.sbatch diff --git a/3.test_cases/pytorch/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch b/examples/training/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch rename to examples/training/mosaicml-composer/mpt/2.train-mpt-manual-distributed.sbatch diff --git a/3.test_cases/pytorch/mosaicml-composer/mpt/Makefile b/examples/training/mosaicml-composer/mpt/Makefile similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/mpt/Makefile rename to examples/training/mosaicml-composer/mpt/Makefile diff --git a/3.test_cases/pytorch/mosaicml-composer/mpt/README.md b/examples/training/mosaicml-composer/mpt/README.md similarity index 96% rename from 3.test_cases/pytorch/mosaicml-composer/mpt/README.md rename to examples/training/mosaicml-composer/mpt/README.md index ae4daa09e..28fc0b5d5 100644 --- a/3.test_cases/pytorch/mosaicml-composer/mpt/README.md +++ b/examples/training/mosaicml-composer/mpt/README.md @@ -15,7 +15,7 @@ This guide assumes that you have the following: * Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed. * An FSx for Lustre filesystem mounted on `/fsx`. -We recommend that you setup a Slurm cluster using the templates in the architectures [directory](../../1.architectures). Before creating the Slurm cluster, you need to setup the following environment variables: +We recommend that you setup a Slurm cluster using the templates in the architectures [directory](../../../../architectures). 
Before creating the Slurm cluster, you need to setup the following environment variables: ```bash export APPS_PATH=/apps @@ -26,7 +26,7 @@ export TEST_CASE_PATH=${HOME}/3.MPT # where you copy the test case or set to yo cd $TEST_CASE_PATH ``` -then follow the detailed instructions [here](../../1.architectures/2.aws-parallelcluster/README.md). +then follow the detailed instructions [here](../../../../architectures/aws-parallelcluster/README.md). ## 2. Build the container diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/README.md b/examples/training/mosaicml-composer/stable-diffusion/README.md similarity index 94% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/README.md rename to examples/training/mosaicml-composer/stable-diffusion/README.md index b2c9cf1ae..c07e32f0f 100644 --- a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/README.md +++ b/examples/training/mosaicml-composer/stable-diffusion/README.md @@ -104,7 +104,7 @@ Once this change is done, you can install composer as `pip3 install -e .` The `single-node` folder also has the Dockerfile with commands to build the image and run the container. If you are opting to setup training with a Conda environment, then this setup is not needed. Run this setup only if you need to run MosaicML Composer from within a Nvidia PyTorch container. ```bash -cd awsome-distributed-training/3.test_cases/6.stable-diffusion/single-node +cd awsome-distributed-ai/examples/training/mosaicml-composer/stable-diffusion/single-node # build the image docker build --build-arg MOSAICML_VERSION=${MOSAICML_VERSION} --build-arg PYTORCH_IMAGE=${PYTORCH_IMAGE} --build-arg PYTORCH_INDEX_URL=${PYTORCH_INDEX_URL} -t ${DOCKER_IMAGE_NAME}:${TAG} -f 0.Dockerfile . 
@@ -186,7 +186,7 @@ More details on this can be found here: https://pytorch.org/blog/accelerated-dif ### 2.1 Multi-Node Training with Slurm -For the multi-node training we've created a [Dockerfile](https://github.com/awslabs/awsome-distributed-training/blob/multi-node/3.test_cases/6.stable-diffusion/multi-node/1.Dockerfile), and Slurm submit script to submit the training job. To get started please follow the guide [AWS ParallelCluster Distributed Training](../../1.architectures/2.aws-parallelcluster). Before starting this section make sure you have the following setup: +For the multi-node training we've created a [Dockerfile](multi-node/1.Dockerfile), and Slurm submit script to submit the training job. To get started please follow the guide [AWS ParallelCluster Distributed Training](../../../../architectures/aws-parallelcluster). Before starting this section make sure you have the following setup: * AWS ParallelCluster >= 3.7.0 * Pyxis @@ -196,8 +196,8 @@ For the multi-node training we've created a [Dockerfile](https://github.com/awsl 1. To get started, clone this repo and cd into the multi-node directory: ``` -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/6.stable-diffusion/multi-node +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/training/mosaicml-composer/stable-diffusion/multi-node ``` #### 2.1.1 Next build the docker image: @@ -240,7 +240,7 @@ rain Epoch 0: 100%|████████████████ ### 2.2 Multi-Node Training with Amazon EKS -Next we will show how to train stable diffusion with Mosaic ML's [composer](https://github.com/mosaicml/composer/tree/dev) on [Amazon EKS](https://aws.amazon.com/eks/). To start we have created an EKS cluster following the steps [here](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/4.amazon-eks). 
You can follow these steps to add a nodegroup of `p5.48xlarge` instances. First export these environment variables. +Next we will show how to train stable diffusion with Mosaic ML's [composer](https://github.com/mosaicml/composer/tree/dev) on [Amazon EKS](https://aws.amazon.com/eks/). To start we have created an EKS cluster following the steps [here](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/amazon-eks). You can follow these steps to add a nodegroup of `p5.48xlarge` instances. First export these environment variables. ```bash export AWS_REGION=us-west-2 @@ -391,7 +391,7 @@ cd /eks/deployment/kubeflow/training-operator #### 2.2.6 Now we can start training -We provide a template YAML file for submitting the stable diffusion distributed training job in [3.stable-diffusion-eks.yaml-template](https://github.com/awslabs/awsome-distributed-training/blob/stable-diffusion-eks/3.test_cases/6.stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template). You can substitute the environment variables in the template manifest as: +We provide a template YAML file for submitting the stable diffusion distributed training job in [3.stable-diffusion-eks.yaml-template](multi-node/3.stable-diffusion-eks.yaml-template). 
You can substitute the environment variables in the template manifest as: ```bash cat 3.mosaicml-sd-eks.yaml-template | envsubst > mosaicml-sd-eks.yaml diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile b/examples/training/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/1.Dockerfile diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch b/examples/training/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/2.train.sbatch diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template b/examples/training/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/3.stable-diffusion-eks.yaml-template diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml b/examples/training/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/4.etcd.yaml diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff-throughput.png b/examples/training/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff-throughput.png similarity index 100% 
rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff-throughput.png rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff-throughput.png diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png b/examples/training/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png rename to examples/training/mosaicml-composer/stable-diffusion/multi-node/p5-model-scaling-stable-diff.png diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile b/examples/training/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile rename to examples/training/mosaicml-composer/stable-diffusion/single-node/0.Dockerfile diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py b/examples/training/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py rename to examples/training/mosaicml-composer/stable-diffusion/single-node/calculate_number_of_parameters.py diff --git a/3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png b/examples/training/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png similarity index 100% rename from 3.test_cases/pytorch/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png rename to examples/training/mosaicml-composer/stable-diffusion/single-node/sd_p4de_p5.png diff --git a/3.test_cases/megatron/nemo-rl/Dockerfile 
b/examples/training/nemo-rl/Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo-rl/Dockerfile rename to examples/training/nemo-rl/Dockerfile diff --git a/3.test_cases/megatron/nemo-rl/README.md b/examples/training/nemo-rl/README.md similarity index 100% rename from 3.test_cases/megatron/nemo-rl/README.md rename to examples/training/nemo-rl/README.md diff --git a/3.test_cases/megatron/nemo-rl/grpo/.trivyignore b/examples/training/nemo-rl/grpo/.trivyignore similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/.trivyignore rename to examples/training/nemo-rl/grpo/.trivyignore diff --git a/3.test_cases/megatron/nemo-rl/grpo/Dockerfile b/examples/training/nemo-rl/grpo/Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/Dockerfile rename to examples/training/nemo-rl/grpo/Dockerfile diff --git a/3.test_cases/megatron/nemo-rl/grpo/README.md b/examples/training/nemo-rl/grpo/README.md similarity index 98% rename from 3.test_cases/megatron/nemo-rl/grpo/README.md rename to examples/training/nemo-rl/grpo/README.md index 7642526bb..67eae554d 100644 --- a/3.test_cases/megatron/nemo-rl/grpo/README.md +++ b/examples/training/nemo-rl/grpo/README.md @@ -77,7 +77,7 @@ Each example is a JSONL line: The dataset is expected at `/fsx/goldilocks/train.jsonl` and `/fsx/goldilocks/test.jsonl` (mounted via FSx). To generate your own: -1. See the sibling generator: `3.test_cases/megatron/nemo-rl/data-prep/generate_goldilocks_data_designer.py` +1. See the sibling generator: `examples/training/nemo-rl/data-prep/generate_goldilocks_data_designer.py` 2. Or use any math problem dataset with `prompt` and `answer` fields in JSONL format. The `kubernetes/rayjob-grpo.yaml` manifest mounts `/fsx/goldilocks/` — update the `goldilocksPath` env var in the manifest if your path differs. 
diff --git a/3.test_cases/megatron/nemo-rl/grpo/THIRD-PARTY-LICENSES b/examples/training/nemo-rl/grpo/THIRD-PARTY-LICENSES similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/THIRD-PARTY-LICENSES rename to examples/training/nemo-rl/grpo/THIRD-PARTY-LICENSES diff --git a/3.test_cases/megatron/nemo-rl/grpo/eval_nemotron_goldilocks.py b/examples/training/nemo-rl/grpo/eval_nemotron_goldilocks.py similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/eval_nemotron_goldilocks.py rename to examples/training/nemo-rl/grpo/eval_nemotron_goldilocks.py diff --git a/3.test_cases/megatron/nemo-rl/grpo/generate_sbom.sh b/examples/training/nemo-rl/grpo/generate_sbom.sh similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/generate_sbom.sh rename to examples/training/nemo-rl/grpo/generate_sbom.sh diff --git a/3.test_cases/megatron/nemo-rl/grpo/kubernetes/rayjob-grpo.yaml b/examples/training/nemo-rl/grpo/kubernetes/rayjob-grpo.yaml similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/kubernetes/rayjob-grpo.yaml rename to examples/training/nemo-rl/grpo/kubernetes/rayjob-grpo.yaml diff --git a/3.test_cases/megatron/nemo-rl/grpo/rayjob_entrypoint.sh b/examples/training/nemo-rl/grpo/rayjob_entrypoint.sh similarity index 100% rename from 3.test_cases/megatron/nemo-rl/grpo/rayjob_entrypoint.sh rename to examples/training/nemo-rl/grpo/rayjob_entrypoint.sh diff --git a/3.test_cases/megatron/nemo-rl/kubernetes/dataset-download-job.yaml b/examples/training/nemo-rl/kubernetes/dataset-download-job.yaml similarity index 100% rename from 3.test_cases/megatron/nemo-rl/kubernetes/dataset-download-job.yaml rename to examples/training/nemo-rl/kubernetes/dataset-download-job.yaml diff --git a/3.test_cases/megatron/nemo-rl/kubernetes/rayjob.yaml b/examples/training/nemo-rl/kubernetes/rayjob.yaml similarity index 98% rename from 3.test_cases/megatron/nemo-rl/kubernetes/rayjob.yaml rename to examples/training/nemo-rl/kubernetes/rayjob.yaml 
index adf00023a..6599e01b0 100644 --- a/3.test_cases/megatron/nemo-rl/kubernetes/rayjob.yaml +++ b/examples/training/nemo-rl/kubernetes/rayjob.yaml @@ -84,7 +84,7 @@ spec: - name: GRPO_MAX_STEPS value: "20" # ── NCCL / EFA ── - # Exclusion pattern per repo convention (see 1.architectures/efa-cheatsheet.md). + # Exclusion pattern per repo convention (see architectures/efa-cheatsheet.md). # hostNetwork: true means the container sees host interfaces — excluding # lo/docker/veth is sufficient to pick the node's primary ENI on g5. - name: NCCL_SOCKET_IFNAME diff --git a/3.test_cases/megatron/nemo-rl/patches/patch_nvrx_features.py b/examples/training/nemo-rl/patches/patch_nvrx_features.py similarity index 100% rename from 3.test_cases/megatron/nemo-rl/patches/patch_nvrx_features.py rename to examples/training/nemo-rl/patches/patch_nvrx_features.py diff --git a/3.test_cases/megatron/nemo-rl/scripts/evaluate_before_after.py b/examples/training/nemo-rl/scripts/evaluate_before_after.py similarity index 100% rename from 3.test_cases/megatron/nemo-rl/scripts/evaluate_before_after.py rename to examples/training/nemo-rl/scripts/evaluate_before_after.py diff --git a/3.test_cases/megatron/nemo-rl/scripts/rayjob_entrypoint.sh b/examples/training/nemo-rl/scripts/rayjob_entrypoint.sh similarity index 100% rename from 3.test_cases/megatron/nemo-rl/scripts/rayjob_entrypoint.sh rename to examples/training/nemo-rl/scripts/rayjob_entrypoint.sh diff --git a/3.test_cases/megatron/nemo-rl/scripts/run_grpo_nvrx.py b/examples/training/nemo-rl/scripts/run_grpo_nvrx.py similarity index 100% rename from 3.test_cases/megatron/nemo-rl/scripts/run_grpo_nvrx.py rename to examples/training/nemo-rl/scripts/run_grpo_nvrx.py diff --git a/3.test_cases/megatron/nemo/Dockerfile b/examples/training/nemo/Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo/Dockerfile rename to examples/training/nemo/Dockerfile diff --git a/3.test_cases/megatron/nemo/PERFORMANCE.md 
b/examples/training/nemo/PERFORMANCE.md similarity index 97% rename from 3.test_cases/megatron/nemo/PERFORMANCE.md rename to examples/training/nemo/PERFORMANCE.md index 7e68ac9ef..250726ea1 100644 --- a/3.test_cases/megatron/nemo/PERFORMANCE.md +++ b/examples/training/nemo/PERFORMANCE.md @@ -19,7 +19,7 @@ This document describes the process of performance measurements of NeMo 2.x fram * [NVIDIA NeMo Performance Scripts](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance/llm) * [NVIDIA NeMo Compatibility Matrix](https://docs.nvidia.com/nemo-framework/user-guide/latest/softwarecomponentversions.html) * [AWS EFA Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html) -* [EFA Cheatsheet](../../../1.architectures/efa-cheatsheet.md) +* [EFA Cheatsheet](../../../architectures/efa-cheatsheet.md) ### NeMo Version Compatibility @@ -60,7 +60,7 @@ Default location is `~/.nemo_run/experiments/`. ### Build Docker Image -The Dockerfile extends the NVIDIA NeMo container with AWS EFA (Elastic Fabric Adapter) support for high-performance networking. See the [Dockerfile](../Dockerfile) in this directory for the complete configuration. +The Dockerfile extends the NVIDIA NeMo container with AWS EFA (Elastic Fabric Adapter) support for high-performance networking. See the [Dockerfile](Dockerfile) in this directory for the complete configuration. Key components installed: - **EFA installer (v1.47.0)** - provides libfabric and Open MPI @@ -90,7 +90,7 @@ Create an `env_vars.json` file with optimized settings for EFA: } ``` -See the [EFA Cheatsheet](../../../1.architectures/efa-cheatsheet.md) for detailed explanations of each variable. +See the [EFA Cheatsheet](../../../architectures/efa-cheatsheet.md) for detailed explanations of each variable. ### Run Performance Test @@ -130,7 +130,7 @@ The performance scripts support multi-node training via Slurm. 
The `--num_gpus` | 128 | 8 | 16 | | 256 | 8 | 32 | -Ensure your Slurm partition has sufficient nodes available. See the [Slurm README](../slurm/README.md) for detailed setup instructions. +Ensure your Slurm partition has sufficient nodes available. See the [Slurm README](slurm/README.md) for detailed setup instructions. ## Pre-training Performance diff --git a/3.test_cases/megatron/nemo/README.md b/examples/training/nemo/README.md similarity index 100% rename from 3.test_cases/megatron/nemo/README.md rename to examples/training/nemo/README.md diff --git a/3.test_cases/megatron/nemo/kubernetes/Dockerfile b/examples/training/nemo/kubernetes/Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/Dockerfile rename to examples/training/nemo/kubernetes/Dockerfile diff --git a/3.test_cases/megatron/nemo/kubernetes/README.md b/examples/training/nemo/kubernetes/README.md similarity index 99% rename from 3.test_cases/megatron/nemo/kubernetes/README.md rename to examples/training/nemo/kubernetes/README.md index 01fa661dc..08defdfa9 100644 --- a/3.test_cases/megatron/nemo/kubernetes/README.md +++ b/examples/training/nemo/kubernetes/README.md @@ -69,8 +69,8 @@ This implementation leverages Kubernetes on AWS infrastructure to orchestrate di Before you begin, ensure you have the following: - **Kubernetes Cluster**: An EKS cluster or SageMaker HyperPod EKS cluster - - For SageMaker HyperPod EKS: Follow [this workshop](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US/00-setup) or [this repository](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/7.sagemaker-hyperpod-eks) - - For standard EKS: Follow [these instructions](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/4.amazon-eks) + - For SageMaker HyperPod EKS: Follow [this workshop](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US/00-setup) or [this 
repository](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-eks) + - For standard EKS: Follow [these instructions](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/amazon-eks) **Additional Setup for Standard EKS (Non-HyperPod):** diff --git a/3.test_cases/megatron/nemo/kubernetes/build.sh b/examples/training/nemo/kubernetes/build.sh similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/build.sh rename to examples/training/nemo/kubernetes/build.sh diff --git a/3.test_cases/megatron/nemo/kubernetes/custom_data_module.py b/examples/training/nemo/kubernetes/custom_data_module.py similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/custom_data_module.py rename to examples/training/nemo/kubernetes/custom_data_module.py diff --git a/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml b/examples/training/nemo/kubernetes/data-processing/data-processing-pod-template.yaml similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml rename to examples/training/nemo/kubernetes/data-processing/data-processing-pod-template.yaml diff --git a/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing.sh b/examples/training/nemo/kubernetes/data-processing/data-processing.sh similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing.sh rename to examples/training/nemo/kubernetes/data-processing/data-processing.sh diff --git a/3.test_cases/megatron/nemo/kubernetes/data-processing/load_dataset.py b/examples/training/nemo/kubernetes/data-processing/load_dataset.py similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/data-processing/load_dataset.py rename to examples/training/nemo/kubernetes/data-processing/load_dataset.py diff --git a/3.test_cases/megatron/nemo/kubernetes/env_vars.json 
b/examples/training/nemo/kubernetes/env_vars.json similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/env_vars.json rename to examples/training/nemo/kubernetes/env_vars.json diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py b/examples/training/nemo/kubernetes/finetune_custom_dataset.py similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py rename to examples/training/nemo/kubernetes/finetune_custom_dataset.py diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py b/examples/training/nemo/kubernetes/finetune_default_dataset.py similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py rename to examples/training/nemo/kubernetes/finetune_default_dataset.py diff --git a/3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_merges b/examples/training/nemo/kubernetes/megatron/megatron-gpt-345m_merges similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_merges rename to examples/training/nemo/kubernetes/megatron/megatron-gpt-345m_merges diff --git a/3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_vocab b/examples/training/nemo/kubernetes/megatron/megatron-gpt-345m_vocab similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/megatron/megatron-gpt-345m_vocab rename to examples/training/nemo/kubernetes/megatron/megatron-gpt-345m_vocab diff --git a/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py b/examples/training/nemo/kubernetes/pretrain_custom_dataset.py similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py rename to examples/training/nemo/kubernetes/pretrain_custom_dataset.py diff --git a/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py b/examples/training/nemo/kubernetes/pretrain_mock_dataset.py similarity index 100% rename from 
3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py rename to examples/training/nemo/kubernetes/pretrain_mock_dataset.py diff --git a/3.test_cases/megatron/nemo/kubernetes/push.sh b/examples/training/nemo/kubernetes/push.sh similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/push.sh rename to examples/training/nemo/kubernetes/push.sh diff --git a/3.test_cases/megatron/nemo/kubernetes/venv.sh b/examples/training/nemo/kubernetes/venv.sh similarity index 100% rename from 3.test_cases/megatron/nemo/kubernetes/venv.sh rename to examples/training/nemo/kubernetes/venv.sh diff --git a/3.test_cases/megatron/nemo/slurm/.gitignore b/examples/training/nemo/slurm/.gitignore similarity index 100% rename from 3.test_cases/megatron/nemo/slurm/.gitignore rename to examples/training/nemo/slurm/.gitignore diff --git a/3.test_cases/megatron/nemo/slurm/README.md b/examples/training/nemo/slurm/README.md similarity index 93% rename from 3.test_cases/megatron/nemo/slurm/README.md rename to examples/training/nemo/slurm/README.md index 72abe4205..0dcffd75a 100644 --- a/3.test_cases/megatron/nemo/slurm/README.md +++ b/examples/training/nemo/slurm/README.md @@ -10,7 +10,7 @@ This guide assumes that you have the following: - Docker, for Slurm [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) need to be installed as well. - An FSx for Lustre filesystem mounted on `/fsx` in all Slurm nodes. Also, this test case assumes that the home directory is also a shared directory. -It is recommended that you use the templates in the architectures [directory](../../1.architectures) for setting up Amazon SageMaker HyperPod or AWS Parallel Cluster. +It is recommended that you use the templates in the architectures [directory](../../../../architectures) for setting up Amazon SageMaker HyperPod or AWS Parallel Cluster. Make sure that your current directory is under a shared filesystem such as `/fsx`. 
@@ -19,8 +19,8 @@ Make sure that your current directory is under a shared filesystem such as `/fsx ```bash cd ~ - git clone https://github.com/awslabs/awsome-distributed-training/ - cd awsome-distributed-training/3.test_cases/22.nemo-run/slurm + git clone https://github.com/awslabs/awsome-distributed-ai/ + cd awsome-distributed-ai/examples/training/nemo/slurm ``` ## 4. Build and Configure the NeMo Job Container diff --git a/3.test_cases/megatron/nemo/slurm/env_vars.json b/examples/training/nemo/slurm/env_vars.json similarity index 100% rename from 3.test_cases/megatron/nemo/slurm/env_vars.json rename to examples/training/nemo/slurm/env_vars.json diff --git a/3.test_cases/megatron/nemo/slurm/run.py b/examples/training/nemo/slurm/run.py similarity index 100% rename from 3.test_cases/megatron/nemo/slurm/run.py rename to examples/training/nemo/slurm/run.py diff --git a/3.test_cases/megatron/nemo/slurm/venv.sh b/examples/training/nemo/slurm/venv.sh similarity index 100% rename from 3.test_cases/megatron/nemo/slurm/venv.sh rename to examples/training/nemo/slurm/venv.sh diff --git a/3.test_cases/megatron/nemo1.0/.gitignore b/examples/training/nemo1.0/.gitignore similarity index 100% rename from 3.test_cases/megatron/nemo1.0/.gitignore rename to examples/training/nemo1.0/.gitignore diff --git a/3.test_cases/megatron/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile b/examples/training/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile rename to examples/training/nemo1.0/0.NemoMegatron-aws-optimized.Dockerfile diff --git a/3.test_cases/megatron/nemo1.0/1.bmk-pretrain-gpt3-126m.sh b/examples/training/nemo1.0/1.bmk-pretrain-gpt3-126m.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/1.bmk-pretrain-gpt3-126m.sh rename to examples/training/nemo1.0/1.bmk-pretrain-gpt3-126m.sh diff --git a/3.test_cases/megatron/nemo1.0/2.bmk-pretrain-gpt3-5b.sh
b/examples/training/nemo1.0/2.bmk-pretrain-gpt3-5b.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/2.bmk-pretrain-gpt3-5b.sh rename to examples/training/nemo1.0/2.bmk-pretrain-gpt3-5b.sh diff --git a/3.test_cases/megatron/nemo1.0/3.bmk-pretrain-gpt3-40b.sh b/examples/training/nemo1.0/3.bmk-pretrain-gpt3-40b.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/3.bmk-pretrain-gpt3-40b.sh rename to examples/training/nemo1.0/3.bmk-pretrain-gpt3-40b.sh diff --git a/3.test_cases/megatron/nemo1.0/4.bmk-pretrain-gpt3-175b.sh b/examples/training/nemo1.0/4.bmk-pretrain-gpt3-175b.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/4.bmk-pretrain-gpt3-175b.sh rename to examples/training/nemo1.0/4.bmk-pretrain-gpt3-175b.sh diff --git a/3.test_cases/megatron/nemo1.0/5.bmk-pretrain-llama-7b.sh b/examples/training/nemo1.0/5.bmk-pretrain-llama-7b.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/5.bmk-pretrain-llama-7b.sh rename to examples/training/nemo1.0/5.bmk-pretrain-llama-7b.sh diff --git a/3.test_cases/megatron/nemo1.0/6.bmk-pretrain-llama-70b.sh b/examples/training/nemo1.0/6.bmk-pretrain-llama-70b.sh similarity index 100% rename from 3.test_cases/megatron/nemo1.0/6.bmk-pretrain-llama-70b.sh rename to examples/training/nemo1.0/6.bmk-pretrain-llama-70b.sh diff --git a/3.test_cases/megatron/nemo1.0/EKS/0.Dockerfile b/examples/training/nemo1.0/EKS/0.Dockerfile similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/0.Dockerfile rename to examples/training/nemo1.0/EKS/0.Dockerfile diff --git a/3.test_cases/megatron/nemo1.0/EKS/README.md b/examples/training/nemo1.0/EKS/README.md similarity index 96% rename from 3.test_cases/megatron/nemo1.0/EKS/README.md rename to examples/training/nemo1.0/EKS/README.md index 63d35250e..550a0d820 100644 --- a/3.test_cases/megatron/nemo1.0/EKS/README.md +++ b/examples/training/nemo1.0/EKS/README.md @@ -5,7 +5,7 @@ In this work we will present a step by step guide to run 
distributed training workloads on an [Amazon EKS](https://aws.amazon.com/eks/) cluster. ## 0. Prerequisites -We require that to run this workload, you have a 2 node P4de or P5 cluster available with EFA enabled and a [Amazon FSx for Lustre](https://aws.amazon.com/fsx/lustre/) mounted on that cluster. You can follow the steps at [4.amazon-eks](https://github.com/awslabs/awsome-distributed-training/tree/1.architectures/4.amazon-eks) to create a EFA enabled EKS cluster with P4de nodes. To this end, we provide the cluster creation config in `p4de-cluster-config.yaml`. +We require that to run this workload, you have a 2 node P4de or P5 cluster available with EFA enabled and a [Amazon FSx for Lustre](https://aws.amazon.com/fsx/lustre/) mounted on that cluster. You can follow the steps at [amazon-eks](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/amazon-eks) to create a EFA enabled EKS cluster with P4de nodes. To this end, we provide the cluster creation config in `p4de-cluster-config.yaml`. This config will create 2 managed node groups, one for the system node `c5.2xlarge` and one `p4de.24xlarge`. Managed node groups will use EKS optimized AMIs. @@ -225,8 +225,8 @@ docker cp -a : /opt/NeMo-Megatron-Launcher/${LAUNCHER_SCRIPTS_PATH Run the following to install the necessary dependencies to run NeMo. ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd ./awsome-distributed-training/3.test_cases/2.nemo-launcher/EKS/ +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd ./awsome-distributed-ai/examples/training/nemo1.0/EKS/ pip install -r requirements.txt ``` @@ -279,8 +279,8 @@ FILE_NUMBERS="0-5" # Number of files to be downloaded out of the 30 files.
With Run the following next to substitute the environment variables in the yaml file and place it in the right location: ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/2.nemo-launcher/EKS/launcher_scripts/conf +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/training/nemo1.0/EKS/launcher_scripts/conf envsubst < ./config.yaml > ${LAUNCHER_SCRIPTS_PATH}/launcher_scripts/conf/config.yaml envsubst < ./cluster/k8s.yaml > ${LAUNCHER_SCRIPTS_PATH}/launcher_scripts/conf/cluster/k8s.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/fsx.png b/examples/training/nemo1.0/EKS/fsx.png similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/fsx.png rename to examples/training/nemo1.0/EKS/fsx.png diff --git a/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pv.yaml b/examples/training/nemo1.0/EKS/fsx/fsx-pv.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pv.yaml rename to examples/training/nemo1.0/EKS/fsx/fsx-pv.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pvc.yaml b/examples/training/nemo1.0/EKS/fsx/fsx-pvc.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-pvc.yaml rename to examples/training/nemo1.0/EKS/fsx/fsx-pvc.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-storage-class.yaml b/examples/training/nemo1.0/EKS/fsx/fsx-storage-class.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/fsx/fsx-storage-class.yaml rename to examples/training/nemo1.0/EKS/fsx/fsx-storage-class.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml b/examples/training/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml rename to examples/training/nemo1.0/EKS/launcher_scripts/conf/cluster/k8s.yaml diff --git
a/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/config.yaml b/examples/training/nemo1.0/EKS/launcher_scripts/conf/config.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/config.yaml rename to examples/training/nemo1.0/EKS/launcher_scripts/conf/config.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml b/examples/training/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml rename to examples/training/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/data_prep.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/examples/training/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml rename to examples/training/nemo1.0/EKS/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml diff --git a/3.test_cases/megatron/nemo1.0/EKS/p4de-cluster-config.yaml b/examples/training/nemo1.0/EKS/p4de-cluster-config.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/EKS/p4de-cluster-config.yaml rename to examples/training/nemo1.0/EKS/p4de-cluster-config.yaml diff --git a/3.test_cases/megatron/nemo1.0/README.md b/examples/training/nemo1.0/README.md similarity index 99% rename from 3.test_cases/megatron/nemo1.0/README.md rename to examples/training/nemo1.0/README.md index 6bddc87fe..c25a182be 100644 --- a/3.test_cases/megatron/nemo1.0/README.md +++ b/examples/training/nemo1.0/README.md @@ -23,7 +23,7 @@ The following pre-requisites are needed to run this example: - You are using p4de.24xlarge instances with A100 80GB or newer, with at least 80GB of memory per GPU. 
- You have access to the base image [NeMo Framework Training](https://registry.ngc.nvidia.com/orgs/ea-bignlp/teams/ga-participants/containers/nemofw-training). To gain access to this image, go to [Get Access to NeMo Framework](https://developer.nvidia.com/nemo-framework) to enroll to organization/team `ea-bignlp/ga-participant`. -- Docker, [Enroot](https://github.com/NVIDIA/enroot) and [Pixys](https://github.com/NVIDIA/pyxis) installed on the cluster and available on all nodes. It is assumed you are using a Custom AMI ([example](../../2.ami_and_containers/1.amazon_machine_image)) +- Docker, [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) installed on the cluster and available on all nodes. It is assumed you are using a Custom AMI ([example](../../../ami_and_containers/amazon_machine_image)) You will need to setup the following environment variables before running the scripts. : diff --git a/3.test_cases/megatron/nemo1.0/conf.template/cluster/bcm.yaml b/examples/training/nemo1.0/conf.template/cluster/bcm.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/conf.template/cluster/bcm.yaml rename to examples/training/nemo1.0/conf.template/cluster/bcm.yaml diff --git a/3.test_cases/megatron/nemo1.0/conf.template/config.yaml b/examples/training/nemo1.0/conf.template/config.yaml similarity index 100% rename from 3.test_cases/megatron/nemo1.0/conf.template/config.yaml rename to examples/training/nemo1.0/conf.template/config.yaml diff --git a/3.test_cases/megatron/nemo1.0/test_nemo_launcher.py b/examples/training/nemo1.0/test_nemo_launcher.py similarity index 100% rename from 3.test_cases/megatron/nemo1.0/test_nemo_launcher.py rename to examples/training/nemo1.0/test_nemo_launcher.py diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/Dockerfile b/examples/training/neuronx-distributed/llama3/kubernetes/Dockerfile similarity index 100% rename from
3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/Dockerfile rename to examples/training/neuronx-distributed/llama3/kubernetes/Dockerfile diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/README.md b/examples/training/neuronx-distributed/llama3/kubernetes/README.md similarity index 94% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/README.md rename to examples/training/neuronx-distributed/llama3/kubernetes/README.md index 270214d04..b03cf4d0e 100644 --- a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/README.md +++ b/examples/training/neuronx-distributed/llama3/kubernetes/README.md @@ -14,7 +14,7 @@ In this section, we showcase how to pre-train Llama3-8B, Llama3 8B model using T ## 0. Prerequisites ### 0.1. EKS Cluster -Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster with atleast 1 trn1.32xlarge/ trn1n.32xlarge. Instructions can be found in [1.architectures](../../1.architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. +Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster with at least 1 trn1.32xlarge/ trn1n.32xlarge. Instructions can be found in [architectures](../../../../../architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project.
### 0.2 HF Access token diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh b/examples/training/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh similarity index 100% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh rename to examples/training/neuronx-distributed/llama3/kubernetes/generate-jobspec.sh diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template b/examples/training/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template similarity index 100% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template rename to examples/training/neuronx-distributed/llama3/kubernetes/llama3_train.yaml-template diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py b/examples/training/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py similarity index 100% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py rename to examples/training/neuronx-distributed/llama3/kubernetes/src/tokenize_data.py diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template b/examples/training/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template similarity index 100% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template rename to examples/training/neuronx-distributed/llama3/kubernetes/tokenize_data.yaml-template diff --git a/3.test_cases/pytorch/neuronx-distributed/llama3/slurm/README.md b/examples/training/neuronx-distributed/llama3/slurm/README.md similarity index 100% rename from 3.test_cases/pytorch/neuronx-distributed/llama3/slurm/README.md rename to examples/training/neuronx-distributed/llama3/slurm/README.md diff --git a/3.test_cases/pytorch/nvrx/.gitignore b/examples/training/nvrx/.gitignore similarity index 100% rename from 
3.test_cases/pytorch/nvrx/.gitignore rename to examples/training/nvrx/.gitignore diff --git a/3.test_cases/pytorch/nvrx/Dockerfile b/examples/training/nvrx/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/nvrx/Dockerfile rename to examples/training/nvrx/Dockerfile diff --git a/3.test_cases/pytorch/nvrx/README.md b/examples/training/nvrx/README.md similarity index 97% rename from 3.test_cases/pytorch/nvrx/README.md rename to examples/training/nvrx/README.md index beb619677..47eca3652 100644 --- a/3.test_cases/pytorch/nvrx/README.md +++ b/examples/training/nvrx/README.md @@ -43,14 +43,14 @@ Two injection modes are supported: ## Prerequisites -To run these tests, you need a training cluster with GPU nodes and shared storage. Instructions for creating a cluster can be found in [1.architectures](../../../1.architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or [EKS Blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints). +To run these tests, you need a training cluster with GPU nodes and shared storage. Instructions for creating a cluster can be found in [architectures](../../../architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or [EKS Blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints). ## 1. 
Build Container Image From the `nvrx/` directory, build a container image with PyTorch 2.9, NVRx 0.4.1, and the training scripts: ```bash -cd 3.test_cases/pytorch/nvrx +cd examples/training/nvrx export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]') export ACCOUNT=$(aws sts get-caller-identity --query Account --output text) diff --git a/3.test_cases/pytorch/nvrx/build_and_push.sh b/examples/training/nvrx/build_and_push.sh similarity index 100% rename from 3.test_cases/pytorch/nvrx/build_and_push.sh rename to examples/training/nvrx/build_and_push.sh diff --git a/3.test_cases/pytorch/nvrx/env_vars.template b/examples/training/nvrx/env_vars.template similarity index 100% rename from 3.test_cases/pytorch/nvrx/env_vars.template rename to examples/training/nvrx/env_vars.template diff --git a/3.test_cases/pytorch/nvrx/kubernetes/README.md b/examples/training/nvrx/kubernetes/README.md similarity index 95% rename from 3.test_cases/pytorch/nvrx/kubernetes/README.md rename to examples/training/nvrx/kubernetes/README.md index 9c77bf0f2..255dda6ef 100644 --- a/3.test_cases/pytorch/nvrx/kubernetes/README.md +++ b/examples/training/nvrx/kubernetes/README.md @@ -6,7 +6,7 @@ This guide walks you through deploying NVRx resiliency experiments on an Amazon ### 0.1. EKS Cluster -You need an Amazon EKS cluster with GPU nodes and EFA networking. Instructions for creating a cluster can be found in [1.architectures](../../../../1.architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or [EKS Blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints). +You need an Amazon EKS cluster with GPU nodes and EFA networking. Instructions for creating a cluster can be found in [architectures](../../../../architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or [EKS Blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints). 
Your cluster must have: - GPU nodes (g5, p4de, or p5 instances) with [NVIDIA device plugin](https://github.com/NVIDIA/k8s-device-plugin) @@ -30,7 +30,7 @@ If the [envsubst](https://github.com/a8m/envsubst) utility is not available in y All commands in this README are run from the `kubernetes/` directory: ```bash -cd 3.test_cases/pytorch/nvrx/kubernetes +cd examples/training/nvrx/kubernetes ``` ## 1. Create Namespace diff --git a/3.test_cases/pytorch/nvrx/kubernetes/deploy.sh b/examples/training/nvrx/kubernetes/deploy.sh similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/deploy.sh rename to examples/training/nvrx/kubernetes/deploy.sh diff --git a/3.test_cases/pytorch/nvrx/kubernetes/fsx-storage.yaml b/examples/training/nvrx/kubernetes/fsx-storage.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/fsx-storage.yaml rename to examples/training/nvrx/kubernetes/fsx-storage.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/namespace.yaml b/examples/training/nvrx/kubernetes/namespace.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/namespace.yaml rename to examples/training/nvrx/kubernetes/namespace.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/secret-hf-token.yaml b/examples/training/nvrx/kubernetes/secret-hf-token.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/secret-hf-token.yaml rename to examples/training/nvrx/kubernetes/secret-hf-token.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-async-ckpt-baseline.yaml b/examples/training/nvrx/kubernetes/training-job-async-ckpt-baseline.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-async-ckpt-baseline.yaml rename to examples/training/nvrx/kubernetes/training-job-async-ckpt-baseline.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-async-ckpt.yaml b/examples/training/nvrx/kubernetes/training-job-async-ckpt.yaml similarity index 100% 
rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-async-ckpt.yaml rename to examples/training/nvrx/kubernetes/training-job-async-ckpt.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-ft-launcher-inprocess.yaml b/examples/training/nvrx/kubernetes/training-job-ft-launcher-inprocess.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-ft-launcher-inprocess.yaml rename to examples/training/nvrx/kubernetes/training-job-ft-launcher-inprocess.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-ft-launcher.yaml b/examples/training/nvrx/kubernetes/training-job-ft-launcher.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-ft-launcher.yaml rename to examples/training/nvrx/kubernetes/training-job-ft-launcher.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-inprocess-baseline.yaml b/examples/training/nvrx/kubernetes/training-job-inprocess-baseline.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-inprocess-baseline.yaml rename to examples/training/nvrx/kubernetes/training-job-inprocess-baseline.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-inprocess.yaml b/examples/training/nvrx/kubernetes/training-job-inprocess.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-inprocess.yaml rename to examples/training/nvrx/kubernetes/training-job-inprocess.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-local-ckpt-baseline.yaml b/examples/training/nvrx/kubernetes/training-job-local-ckpt-baseline.yaml similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-local-ckpt-baseline.yaml rename to examples/training/nvrx/kubernetes/training-job-local-ckpt-baseline.yaml diff --git a/3.test_cases/pytorch/nvrx/kubernetes/training-job-local-ckpt.yaml b/examples/training/nvrx/kubernetes/training-job-local-ckpt.yaml 
similarity index 100% rename from 3.test_cases/pytorch/nvrx/kubernetes/training-job-local-ckpt.yaml rename to examples/training/nvrx/kubernetes/training-job-local-ckpt.yaml diff --git a/3.test_cases/pytorch/nvrx/prepare_dataset.py b/examples/training/nvrx/prepare_dataset.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/prepare_dataset.py rename to examples/training/nvrx/prepare_dataset.py diff --git a/3.test_cases/pytorch/nvrx/requirements.txt b/examples/training/nvrx/requirements.txt similarity index 100% rename from 3.test_cases/pytorch/nvrx/requirements.txt rename to examples/training/nvrx/requirements.txt diff --git a/3.test_cases/pytorch/nvrx/src/distributed_utils.py b/examples/training/nvrx/src/distributed_utils.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/distributed_utils.py rename to examples/training/nvrx/src/distributed_utils.py diff --git a/3.test_cases/pytorch/nvrx/src/failure_simulator.py b/examples/training/nvrx/src/failure_simulator.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/failure_simulator.py rename to examples/training/nvrx/src/failure_simulator.py diff --git a/3.test_cases/pytorch/nvrx/src/fsdp_config.py b/examples/training/nvrx/src/fsdp_config.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/fsdp_config.py rename to examples/training/nvrx/src/fsdp_config.py diff --git a/3.test_cases/pytorch/nvrx/src/metrics_collector.py b/examples/training/nvrx/src/metrics_collector.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/metrics_collector.py rename to examples/training/nvrx/src/metrics_collector.py diff --git a/3.test_cases/pytorch/nvrx/src/train_async_ckpt.py b/examples/training/nvrx/src/train_async_ckpt.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/train_async_ckpt.py rename to examples/training/nvrx/src/train_async_ckpt.py diff --git a/3.test_cases/pytorch/nvrx/src/train_ft_launcher.py 
b/examples/training/nvrx/src/train_ft_launcher.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/train_ft_launcher.py rename to examples/training/nvrx/src/train_ft_launcher.py diff --git a/3.test_cases/pytorch/nvrx/src/train_inprocess.py b/examples/training/nvrx/src/train_inprocess.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/train_inprocess.py rename to examples/training/nvrx/src/train_inprocess.py diff --git a/3.test_cases/pytorch/nvrx/src/train_local_ckpt.py b/examples/training/nvrx/src/train_local_ckpt.py similarity index 100% rename from 3.test_cases/pytorch/nvrx/src/train_local_ckpt.py rename to examples/training/nvrx/src/train_local_ckpt.py diff --git a/3.test_cases/pytorch/openrlhf/Dockerfile b/examples/training/openrlhf/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/openrlhf/Dockerfile rename to examples/training/openrlhf/Dockerfile diff --git a/3.test_cases/pytorch/openrlhf/README.md b/examples/training/openrlhf/README.md similarity index 100% rename from 3.test_cases/pytorch/openrlhf/README.md rename to examples/training/openrlhf/README.md diff --git a/3.test_cases/pytorch/openrlhf/buildspec.yml b/examples/training/openrlhf/buildspec.yml similarity index 100% rename from 3.test_cases/pytorch/openrlhf/buildspec.yml rename to examples/training/openrlhf/buildspec.yml diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py b/examples/training/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py rename to examples/training/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh b/examples/training/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh rename to 
examples/training/openrlhf/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/language_reward.py b/examples/training/openrlhf/hyperpod-eks/rlvr/recipe/language_reward.py similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/language_reward.py rename to examples/training/openrlhf/hyperpod-eks/rlvr/recipe/language_reward.py diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh b/examples/training/openrlhf/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh rename to examples/training/openrlhf/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/env_vars.example b/examples/training/openrlhf/hyperpod-eks/rlvr/setup/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/env_vars.example rename to examples/training/openrlhf/hyperpod-eks/rlvr/setup/env_vars.example diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/load_data_gptoss.sh b/examples/training/openrlhf/hyperpod-eks/rlvr/setup/load_data_gptoss.sh similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/load_data_gptoss.sh rename to examples/training/openrlhf/hyperpod-eks/rlvr/setup/load_data_gptoss.sh diff --git a/3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/raycluster.yaml b/examples/training/openrlhf/hyperpod-eks/rlvr/setup/raycluster.yaml similarity index 100% rename from 3.test_cases/pytorch/openrlhf/hyperpod-eks/rlvr/setup/raycluster.yaml rename to examples/training/openrlhf/hyperpod-eks/rlvr/setup/raycluster.yaml diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile similarity index 100% rename from 
3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/Dockerfile diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/README.md b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/README.md similarity index 97% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/README.md rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/README.md index 7d79af877..60e15d1fb 100644 --- a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/README.md +++ b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/README.md @@ -37,7 +37,7 @@ This solution uses: ### 0.1. EKS Cluster -Before running this training, you'll need an Amazon EKS or SageMaker HyperPod EKS cluster with at least 1 Trainium node (trn1.32xlarge, trn1n.32xlarge, or trn2.48xlarge). Instructions can be found in [1.architectures](../../1.architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. +Before running this training, you'll need an Amazon EKS or SageMaker HyperPod EKS cluster with at least 1 Trainium node (trn1.32xlarge, trn1n.32xlarge, or trn2.48xlarge). Instructions can be found in [architectures](../../../../../../architectures), the [aws-do-eks](https://bit.ly/do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. ### 0.2. 
Setup Persistent Volume Claim (PVC) for FSx diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/generate-jobspec.sh diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/compile_peft.yaml-template diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/consolidation.yaml-template diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/download_model.yaml-template b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/download_model.yaml-template similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/download_model.yaml-template rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/download_model.yaml-template diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/launch_peft_train.yaml-template 
b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/launch_peft_train.yaml-template similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/launch_peft_train.yaml-template rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/launch_peft_train.yaml-template diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template b/examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template rename to examples/training/optimum-neuron/llama3/kubernetes/fine-tuning/templates/merge_lora.yaml-template diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/README.md b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/README.md similarity index 97% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/README.md rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/README.md index 3ee375ee1..5acb7e6a7 100644 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/README.md +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/README.md @@ -40,13 +40,13 @@ You will also need to complete the following prerequisites: ## Step 1: Download Training Scripts -Begin by downloading the training scripts from the awsome-distributed-training repo: +Begin by downloading the training scripts from the awsome-distributed-ai repo: ```bash cd ~/ -git clone https://github.com/awslabs/awsome-distributed-training +git clone https://github.com/awslabs/awsome-distributed-ai -cd ~/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning +cd ~/awsome-distributed-ai/examples/training/optimum-neuron/llama3/slurm/fine-tuning ``` ## Step 2: Setup Python Environment diff --git 
a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh similarity index 96% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh index fc5749ffe..5e60961e1 100644 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh @@ -64,7 +64,7 @@ declare -a TORCHRUN_ARGS=( --nnodes=$SLURM_JOB_NUM_NODES ) -export TRAIN_SCRIPT=/fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/src/train.py +export TRAIN_SCRIPT=/fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/src/train.py ############################ ##### Training Params ###### diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/0.create_env.sh diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh similarity index 82% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh index 8188fbdb3..e4f4c59d4 100755 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh +++ 
b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/1.download_model.sh @@ -7,7 +7,7 @@ export OMP_NUM_THREADS=1 export HF_TOKEN="" -INPUT_PATH="/fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/src/get_model.py" +INPUT_PATH="/fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/src/get_model.py" MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" MODEL_OUTPUT_PATH="/fsx/ubuntu/peft_ft/model_artifacts/llama3-8B" TOKENIZER_OUTPUT_PATH="/fsx/ubuntu/peft_ft/tokenizer/llama3-8B" diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh similarity index 60% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh index 04fc624b6..100caeca8 100755 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/2.compile_model.sh @@ -7,4 +7,4 @@ export OMP_NUM_THREADS=1 export NEURON_EXTRACT_GRAPHS_ONLY=1 -srun bash /fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh +srun bash /fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh similarity index 70% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh index 29d3419e6..036d407f6 100755 --- 
a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/3.finetune.sh @@ -12,4 +12,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then echo "Detected Hyperpod cluster.. enabling --auto-resume=1" AUTO_RESUME="--auto-resume=1" fi -srun ${AUTO_RESUME} bash /fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh +srun ${AUTO_RESUME} bash /fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/slurm/fine-tuning/finetune-llama3-8B.sh diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh similarity index 74% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh index d3158d98a..5e3a53147 100755 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/4.model_consolidation.sh @@ -6,7 +6,7 @@ export OMP_NUM_THREADS=1 -srun python3 "/fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/src/model_consolidation.py" \ +srun python3 "/fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/src/model_consolidation.py" \ --input_dir "/fsx/ubuntu/peft_ft/model_checkpoints/checkpoint-1251" \ --output_dir "/fsx/ubuntu/peft_ft/model_checkpoints/adapter_shards_consolidation"\ --save_format "safetensors" \ No newline at end of file diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh 
b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh similarity index 81% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh index 53b0061ca..b5273a791 100755 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/5.merge_lora_weights.sh @@ -6,7 +6,7 @@ export OMP_NUM_THREADS=1 -srun python3 "/fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/src/merge_lora_weights.py" \ +srun python3 "/fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/src/merge_lora_weights.py" \ --final_model_path "/fsx/ubuntu/peft_ft/model_checkpoints/final_model_output" \ --adapter_config_path "/fsx/ubuntu/peft_ft/model_checkpoints/checkpoint-1251/adapter_config.json"\ --base_model_path "/fsx/ubuntu/peft_ft/model_artifacts/llama3-8B" \ diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh similarity index 73% rename from 3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh rename to examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh index 6edb181a4..5e61a5719 100755 --- a/3.test_cases/pytorch/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh +++ b/examples/training/optimum-neuron/llama3/slurm/fine-tuning/submit_jobs/6.inference.sh @@ -7,6 +7,6 @@ export OMP_NUM_THREADS=1 export HF_TOKEN="" -srun python3 "/fsx/ubuntu/awsome-distributed-training/3.test_cases/pytorch/optimum-neuron/llama3/src/run_inference.py" \ +srun python3 
"/fsx/ubuntu/awsome-distributed-ai/examples/training/optimum-neuron/llama3/src/run_inference.py" \ --model_path "/fsx/ubuntu/peft_ft/model_checkpoints/final_model_output" \ --model_id "meta-llama/Meta-Llama-3-8B-Instruct" \ No newline at end of file diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/get_model.py b/examples/training/optimum-neuron/llama3/src/get_model.py similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/src/get_model.py rename to examples/training/optimum-neuron/llama3/src/get_model.py diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/merge_lora_weights.py b/examples/training/optimum-neuron/llama3/src/merge_lora_weights.py similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/src/merge_lora_weights.py rename to examples/training/optimum-neuron/llama3/src/merge_lora_weights.py diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/model_consolidation.py b/examples/training/optimum-neuron/llama3/src/model_consolidation.py similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/src/model_consolidation.py rename to examples/training/optimum-neuron/llama3/src/model_consolidation.py diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/peft_tokenize_data.py b/examples/training/optimum-neuron/llama3/src/peft_tokenize_data.py similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/src/peft_tokenize_data.py rename to examples/training/optimum-neuron/llama3/src/peft_tokenize_data.py diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/run_inference.py b/examples/training/optimum-neuron/llama3/src/run_inference.py similarity index 100% rename from 3.test_cases/pytorch/optimum-neuron/llama3/src/run_inference.py rename to examples/training/optimum-neuron/llama3/src/run_inference.py diff --git a/3.test_cases/pytorch/optimum-neuron/llama3/src/train.py b/examples/training/optimum-neuron/llama3/src/train.py similarity index 100% rename 
from 3.test_cases/pytorch/optimum-neuron/llama3/src/train.py rename to examples/training/optimum-neuron/llama3/src/train.py diff --git a/3.test_cases/pytorch/picotron/.gitignore b/examples/training/picotron/.gitignore similarity index 100% rename from 3.test_cases/pytorch/picotron/.gitignore rename to examples/training/picotron/.gitignore diff --git a/3.test_cases/pytorch/picotron/README.md b/examples/training/picotron/README.md similarity index 100% rename from 3.test_cases/pytorch/picotron/README.md rename to examples/training/picotron/README.md diff --git a/3.test_cases/pytorch/picotron/SmolLM-1.7B/README.md b/examples/training/picotron/SmolLM-1.7B/README.md similarity index 100% rename from 3.test_cases/pytorch/picotron/SmolLM-1.7B/README.md rename to examples/training/picotron/SmolLM-1.7B/README.md diff --git a/3.test_cases/pytorch/picotron/SmolLM-1.7B/ec2/README.md b/examples/training/picotron/SmolLM-1.7B/ec2/README.md similarity index 100% rename from 3.test_cases/pytorch/picotron/SmolLM-1.7B/ec2/README.md rename to examples/training/picotron/SmolLM-1.7B/ec2/README.md diff --git a/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/README.md b/examples/training/picotron/SmolLM-1.7B/slurm/README.md similarity index 98% rename from 3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/README.md rename to examples/training/picotron/SmolLM-1.7B/slurm/README.md index 9cd3a3168..dc62c4177 100644 --- a/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/README.md +++ b/examples/training/picotron/SmolLM-1.7B/slurm/README.md @@ -6,7 +6,7 @@ This guide demonstrates how to run distributed training across two GPU instances 1. 
**Cluster Setup** - A Slurm cluster on AWS with at least two GPU compute nodes - - We recommend using either AWS ParallelCluster or SageMaker HyperPod with our provided templates in the [architectures directory](../../../1.architectures) + - We recommend using either AWS ParallelCluster or SageMaker HyperPod with our provided templates in the [architectures directory](../../../../../architectures) 2. **Node Requirements** - Docker diff --git a/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/train.sbatch b/examples/training/picotron/SmolLM-1.7B/slurm/train.sbatch similarity index 95% rename from 3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/train.sbatch rename to examples/training/picotron/SmolLM-1.7B/slurm/train.sbatch index 0c2a3f809..f01ca1fc4 100644 --- a/3.test_cases/pytorch/picotron/SmolLM-1.7B/slurm/train.sbatch +++ b/examples/training/picotron/SmolLM-1.7B/slurm/train.sbatch @@ -20,7 +20,7 @@ GPUS_PER_NODE=1 ######## EFA / NCCL ####### ########################### -## See 1.architectures/efa-cheatsheet.md for full reference +## See architectures/efa-cheatsheet.md for full reference export FI_EFA_USE_HUGE_PAGE=0 export FI_PROVIDER=efa export NCCL_SOCKET_IFNAME=^docker,lo,veth diff --git a/3.test_cases/pytorch/picotron/create_config.py b/examples/training/picotron/create_config.py similarity index 100% rename from 3.test_cases/pytorch/picotron/create_config.py rename to examples/training/picotron/create_config.py diff --git a/3.test_cases/pytorch/picotron/picotron.Dockerfile b/examples/training/picotron/picotron.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/picotron/picotron.Dockerfile rename to examples/training/picotron/picotron.Dockerfile diff --git a/3.test_cases/pytorch/picotron/template/base_config.json b/examples/training/picotron/template/base_config.json similarity index 100% rename from 3.test_cases/pytorch/picotron/template/base_config.json rename to examples/training/picotron/template/base_config.json diff --git 
a/3.test_cases/pytorch/picotron/train.py b/examples/training/picotron/train.py similarity index 100% rename from 3.test_cases/pytorch/picotron/train.py rename to examples/training/picotron/train.py diff --git a/3.test_cases/pytorch/torchtitan/README.md b/examples/training/torchtitan/README.md similarity index 100% rename from 3.test_cases/pytorch/torchtitan/README.md rename to examples/training/torchtitan/README.md diff --git a/3.test_cases/pytorch/torchtitan/slurm/.gitignore b/examples/training/torchtitan/slurm/.gitignore similarity index 100% rename from 3.test_cases/pytorch/torchtitan/slurm/.gitignore rename to examples/training/torchtitan/slurm/.gitignore diff --git a/3.test_cases/pytorch/torchtitan/slurm/0.create_venv.sh b/examples/training/torchtitan/slurm/0.create_venv.sh similarity index 96% rename from 3.test_cases/pytorch/torchtitan/slurm/0.create_venv.sh rename to examples/training/torchtitan/slurm/0.create_venv.sh index 3a9ed5a23..74df2aaa4 100755 --- a/3.test_cases/pytorch/torchtitan/slurm/0.create_venv.sh +++ b/examples/training/torchtitan/slurm/0.create_venv.sh @@ -4,7 +4,7 @@ set -euo pipefail -# torchtitan host environment for awsome-distributed-training. +# torchtitan host environment for awsome-distributed-ai. 
# # Replaces the previous Miniconda-based setup with a stdlib `python -m venv` # environment, and pins all components to released versions instead of diff --git a/3.test_cases/pytorch/torchtitan/slurm/1.llama_3_8b_torchtitan.sh b/examples/training/torchtitan/slurm/1.llama_3_8b_torchtitan.sh similarity index 100% rename from 3.test_cases/pytorch/torchtitan/slurm/1.llama_3_8b_torchtitan.sh rename to examples/training/torchtitan/slurm/1.llama_3_8b_torchtitan.sh diff --git a/3.test_cases/pytorch/torchtitan/slurm/README.md b/examples/training/torchtitan/slurm/README.md similarity index 94% rename from 3.test_cases/pytorch/torchtitan/slurm/README.md rename to examples/training/torchtitan/slurm/README.md index 285dd1a5d..3279d108d 100644 --- a/3.test_cases/pytorch/torchtitan/slurm/README.md +++ b/examples/training/torchtitan/slurm/README.md @@ -2,7 +2,7 @@ ### 0. Prerequisites -Before running this training, you'll need to create a Slurm cluster with an FSx for Lustre file system. Instructions can be found in [1.architectures](../../../1.architectures). FP8 data types are natively supported on NVIDIA H100 and subsequent generations, so it is recommended to run this on at least 1 x p5/p5e/p5en.48xlarge (H100) or p6-b200/p6-b300 (Blackwell) instance. The [Performance Numbers](#performance-numbers) section was originally captured on 4 x p5.48xlarge. +Before running this training, you'll need to create a Slurm cluster with an FSx for Lustre file system. Instructions can be found in [architectures](../../../../architectures). FP8 data types are natively supported on NVIDIA H100 and subsequent generations, so it is recommended to run this on at least 1 x p5/p5e/p5en.48xlarge (H100) or p6-b200/p6-b300 (Blackwell) instance. The [Performance Numbers](#performance-numbers) section was originally captured on 4 x p5.48xlarge. 
The setup script targets CUDA 13 (`cu130`) wheels so that `torch.compile`-ed FP8 kernels run with native `sm_103` binaries on P6-B300; older drivers/CUDA toolkits will fall back to PTX-JIT for B300. diff --git a/3.test_cases/pytorch/torchtitan/slurm/configs/llama3_8b.toml b/examples/training/torchtitan/slurm/configs/llama3_8b.toml similarity index 100% rename from 3.test_cases/pytorch/torchtitan/slurm/configs/llama3_8b.toml rename to examples/training/torchtitan/slurm/configs/llama3_8b.toml diff --git a/3.test_cases/pytorch/torchtitan/slurm/configs/llama3_8b_fp8_compile.toml b/examples/training/torchtitan/slurm/configs/llama3_8b_fp8_compile.toml similarity index 100% rename from 3.test_cases/pytorch/torchtitan/slurm/configs/llama3_8b_fp8_compile.toml rename to examples/training/torchtitan/slurm/configs/llama3_8b_fp8_compile.toml diff --git a/3.test_cases/pytorch/trl/Dockerfile b/examples/training/trl/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/trl/Dockerfile rename to examples/training/trl/Dockerfile diff --git a/3.test_cases/pytorch/trl/README.md b/examples/training/trl/README.md similarity index 90% rename from 3.test_cases/pytorch/trl/README.md rename to examples/training/trl/README.md index 573e8a53a..9bd7af9de 100644 --- a/3.test_cases/pytorch/trl/README.md +++ b/examples/training/trl/README.md @@ -9,7 +9,7 @@ All test cases share a common base Docker image defined in [`Dockerfile`](Docker Build the shared base image: ```bash -cd 3.test_cases/pytorch/trl +cd examples/training/trl docker build -t trl-base:latest . ``` @@ -22,7 +22,7 @@ docker build -t trl-base:latest . 
## Prerequisites -- GPU cluster with EFA networking (see [`1.architectures/`](../../../1.architectures/) for cluster setup) +- GPU cluster with EFA networking (see [`architectures/`](../../../architectures/) for cluster setup) - Shared filesystem (e.g., Amazon FSx for Lustre) accessible from all nodes - [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) for Slurm container execution - Hugging Face model access tokens configured via `HF_HOME` diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/README.md b/examples/training/trl/gpt-oss-lora-grpo/README.md similarity index 97% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/README.md rename to examples/training/trl/gpt-oss-lora-grpo/README.md index 523214dcf..e05d85364 100644 --- a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/README.md +++ b/examples/training/trl/gpt-oss-lora-grpo/README.md @@ -22,7 +22,7 @@ This guide explains how to train the GPT-OSS 20B model with LoRA, then improve i ### 0.1. EKS Cluster -Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster. Instructions can be found in [1.architectures](../../../../1.architectures), the [aws-do-eks](https://github.com/aws-samples/aws-do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. +Before running this training, you'll need to create an Amazon EKS or a SageMaker HyperPod EKS cluster. Instructions can be found in [architectures](../../../../architectures), the [aws-do-eks](https://github.com/aws-samples/aws-do-eks) project, or the [eks-blueprints](https://github.com/aws-ia/terraform-aws-eks-blueprints) project. ### 0.2. Connect to your EKS Cluster @@ -45,7 +45,7 @@ arn:aws:eks:us-east-2:xxxxxxxxxxxx:cluster/xxx-eks-cluster ### 0.3. Clone the repository ```bash -git clone https://github.com/awslabs/awsome-distributed-training/ +git clone https://github.com/awslabs/awsome-distributed-ai/ ``` ## 1. 
Build container image diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/Dockerfile b/examples/training/trl/gpt-oss-lora-grpo/artifacts/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/Dockerfile rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/Dockerfile diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/build_push.sh b/examples/training/trl/gpt-oss-lora-grpo/artifacts/build_push.sh similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/build_push.sh rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/build_push.sh diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/__init__.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/__init__.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/__init__.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/__init__.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/configs/sft_lora.yaml b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/configs/sft_lora.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/configs/sft_lora.yaml rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/configs/sft_lora.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/convert_fsdp_checkpoint.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/convert_fsdp_checkpoint.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/convert_fsdp_checkpoint.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/convert_fsdp_checkpoint.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/convert_grpo_checkpoint.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/convert_grpo_checkpoint.py similarity index 100% rename from 
3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/convert_grpo_checkpoint.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/convert_grpo_checkpoint.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/evaluate_grpo.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/evaluate_grpo.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/evaluate_grpo.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/evaluate_grpo.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/grpo_singlenode.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/grpo_singlenode.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/grpo_singlenode.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/grpo_singlenode.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/inference_g6e.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/inference_g6e.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/inference_g6e.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/inference_g6e.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/inference_grpo_new.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/inference_grpo_new.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/inference_grpo_new.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/inference_grpo_new.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/sft.py b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/sft.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/sft.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/sft.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/sft_teacher_data.py 
b/examples/training/trl/gpt-oss-lora-grpo/artifacts/src/sft_teacher_data.py similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/artifacts/src/sft_teacher_data.py rename to examples/training/trl/gpt-oss-lora-grpo/artifacts/src/sft_teacher_data.py diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/env_vars.example b/examples/training/trl/gpt-oss-lora-grpo/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/env_vars.example rename to examples/training/trl/gpt-oss-lora-grpo/env_vars.example diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/hyperpod-eks/train-lora-hyperpod-elastic-g6e.yaml b/examples/training/trl/gpt-oss-lora-grpo/hyperpod-eks/train-lora-hyperpod-elastic-g6e.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/hyperpod-eks/train-lora-hyperpod-elastic-g6e.yaml rename to examples/training/trl/gpt-oss-lora-grpo/hyperpod-eks/train-lora-hyperpod-elastic-g6e.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/eval-grpo.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/eval-grpo.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/eval-grpo.yaml rename to examples/training/trl/gpt-oss-lora-grpo/kubernetes/eval-grpo.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/fsx-storage-manager.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/fsx-storage-manager.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/fsx-storage-manager.yaml rename to examples/training/trl/gpt-oss-lora-grpo/kubernetes/fsx-storage-manager.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-base.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-base.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-base.yaml rename to 
examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-base.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-grpo.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-grpo.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-grpo.yaml rename to examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-grpo.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-trained.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-trained.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-trained.yaml rename to examples/training/trl/gpt-oss-lora-grpo/kubernetes/inference-g6e-trained.yaml diff --git a/3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/train-grpo-singlenode.yaml b/examples/training/trl/gpt-oss-lora-grpo/kubernetes/train-grpo-singlenode.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/gpt-oss-lora-grpo/kubernetes/train-grpo-singlenode.yaml rename to examples/training/trl/gpt-oss-lora-grpo/kubernetes/train-grpo-singlenode.yaml diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/README.md b/examples/training/trl/grpo-math-reasoning/README.md similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/README.md rename to examples/training/trl/grpo-math-reasoning/README.md diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/deepspeed_zero3.yaml b/examples/training/trl/grpo-math-reasoning/deepspeed_zero3.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/deepspeed_zero3.yaml rename to examples/training/trl/grpo-math-reasoning/deepspeed_zero3.yaml diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/eval.py b/examples/training/trl/grpo-math-reasoning/eval.py similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/eval.py rename to 
examples/training/trl/grpo-math-reasoning/eval.py diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/grpo_wandb.png b/examples/training/trl/grpo-math-reasoning/grpo_wandb.png similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/grpo_wandb.png rename to examples/training/trl/grpo-math-reasoning/grpo_wandb.png diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/inference.py b/examples/training/trl/grpo-math-reasoning/inference.py similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/inference.py rename to examples/training/trl/grpo-math-reasoning/inference.py diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/train.py b/examples/training/trl/grpo-math-reasoning/train.py similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/train.py rename to examples/training/trl/grpo-math-reasoning/train.py diff --git a/3.test_cases/pytorch/trl/grpo-math-reasoning/train.sbatch b/examples/training/trl/grpo-math-reasoning/train.sbatch similarity index 100% rename from 3.test_cases/pytorch/trl/grpo-math-reasoning/train.sbatch rename to examples/training/trl/grpo-math-reasoning/train.sbatch diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/Dockerfile b/examples/training/trl/openenv-wordle-grpo/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/Dockerfile rename to examples/training/trl/openenv-wordle-grpo/Dockerfile diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/README.md b/examples/training/trl/openenv-wordle-grpo/README.md similarity index 98% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/README.md rename to examples/training/trl/openenv-wordle-grpo/README.md index 7870279c3..44866e1b3 100644 --- a/3.test_cases/pytorch/trl/openenv-wordle-grpo/README.md +++ b/examples/training/trl/openenv-wordle-grpo/README.md @@ -95,7 +95,7 @@ For more details, see the [OpenEnv documentation](https://meta-pytorch.org/OpenE ## Prerequisites 
-1. **SageMaker HyperPod EKS cluster** with GPU worker groups (e.g. `ml.g6.12xlarge` or `ml.g6e.12xlarge`). See [`1.architectures/7.sagemaker-hyperpod-eks/`](../../../../1.architectures/7.sagemaker-hyperpod-eks/) for cluster setup instructions. +1. **SageMaker HyperPod EKS cluster** with GPU worker groups (e.g. `ml.g6.12xlarge` or `ml.g6e.12xlarge`). See [`architectures/sagemaker-hyperpod-eks/`](../../../../architectures/sagemaker-hyperpod-eks/) for cluster setup instructions. 2. **HyperPod Helm chart** (`hyperpod-dependencies`) installed in the `kube-system` namespace. This bundles the NVIDIA device plugin, health monitoring agents, and other HyperPod components: ```bash @@ -145,8 +145,8 @@ hyperpod-i-0f9e8d7c6b5a43210 Ready 2d v1.33.5-eks-ecaa3a6 Sch ### Clone the repository ```bash -git clone https://github.com/awslabs/awsome-distributed-training/ -cd awsome-distributed-training/3.test_cases/pytorch/trl/openenv-wordle-grpo +git clone https://github.com/awslabs/awsome-distributed-ai/ +cd awsome-distributed-ai/examples/training/trl/openenv-wordle-grpo ``` ## 1. 
Build Container Image diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/env_vars.example b/examples/training/trl/openenv-wordle-grpo/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/env_vars.example rename to examples/training/trl/openenv-wordle-grpo/env_vars.example diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/inference-wordle.yaml b/examples/training/trl/openenv-wordle-grpo/kubernetes/inference-wordle.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/inference-wordle.yaml rename to examples/training/trl/openenv-wordle-grpo/kubernetes/inference-wordle.yaml diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/openenv-wordle-env.yaml b/examples/training/trl/openenv-wordle-grpo/kubernetes/openenv-wordle-env.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/openenv-wordle-env.yaml rename to examples/training/trl/openenv-wordle-grpo/kubernetes/openenv-wordle-env.yaml diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle-multigpu.yaml b/examples/training/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle-multigpu.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle-multigpu.yaml rename to examples/training/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle-multigpu.yaml diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle.yaml b/examples/training/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle.yaml similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle.yaml rename to examples/training/trl/openenv-wordle-grpo/kubernetes/train-grpo-wordle.yaml diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/src/requirements.txt b/examples/training/trl/openenv-wordle-grpo/src/requirements.txt similarity index 
100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/src/requirements.txt rename to examples/training/trl/openenv-wordle-grpo/src/requirements.txt diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/src/train_wordle_grpo.py b/examples/training/trl/openenv-wordle-grpo/src/train_wordle_grpo.py similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/src/train_wordle_grpo.py rename to examples/training/trl/openenv-wordle-grpo/src/train_wordle_grpo.py diff --git a/3.test_cases/pytorch/trl/openenv-wordle-grpo/src/wordle_prompt.txt b/examples/training/trl/openenv-wordle-grpo/src/wordle_prompt.txt similarity index 100% rename from 3.test_cases/pytorch/trl/openenv-wordle-grpo/src/wordle_prompt.txt rename to examples/training/trl/openenv-wordle-grpo/src/wordle_prompt.txt diff --git a/3.test_cases/pytorch/verl/.gitignore b/examples/training/verl/.gitignore similarity index 100% rename from 3.test_cases/pytorch/verl/.gitignore rename to examples/training/verl/.gitignore diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/Dockerfile b/examples/training/verl/hyperpod-eks/rlvr/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/Dockerfile rename to examples/training/verl/hyperpod-eks/rlvr/Dockerfile diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/README.md b/examples/training/verl/hyperpod-eks/rlvr/README.md similarity index 98% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/README.md rename to examples/training/verl/hyperpod-eks/rlvr/README.md index 23ba555f2..5d26ac572 100644 --- a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/README.md +++ b/examples/training/verl/hyperpod-eks/rlvr/README.md @@ -30,8 +30,8 @@ The example was tested on versions: ### Clone this repo ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/verl/hyperpod-eks/rlvr +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd 
awsome-distributed-ai/examples/training/verl/hyperpod-eks/rlvr ``` ### Install verl repository diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/img/ray-dashboard.png b/examples/training/verl/hyperpod-eks/rlvr/img/ray-dashboard.png similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/img/ray-dashboard.png rename to examples/training/verl/hyperpod-eks/rlvr/img/ray-dashboard.png diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/job-stop.sh b/examples/training/verl/hyperpod-eks/rlvr/job-stop.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/job-stop.sh rename to examples/training/verl/hyperpod-eks/rlvr/job-stop.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/Dockerfile b/examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/Dockerfile rename to examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/Dockerfile diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/README.md b/examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/README.md similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/README.md rename to examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/README.md diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/build-push.sh b/examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/build-push.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/build-push.sh rename to examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/build-push.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml 
b/examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml rename to examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh b/examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh rename to examples/training/verl/hyperpod-eks/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/observability/README.md b/examples/training/verl/hyperpod-eks/rlvr/observability/README.md similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/observability/README.md rename to examples/training/verl/hyperpod-eks/rlvr/observability/README.md diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/observability/add-ray-metrics.sh b/examples/training/verl/hyperpod-eks/rlvr/observability/add-ray-metrics.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/observability/add-ray-metrics.sh rename to examples/training/verl/hyperpod-eks/rlvr/observability/add-ray-metrics.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/ray-expose.sh b/examples/training/verl/hyperpod-eks/rlvr/ray-expose.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/ray-expose.sh rename to examples/training/verl/hyperpod-eks/rlvr/ray-expose.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/ray-hide.sh b/examples/training/verl/hyperpod-eks/rlvr/ray-hide.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/ray-hide.sh rename to examples/training/verl/hyperpod-eks/rlvr/ray-hide.sh diff 
--git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py b/examples/training/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py rename to examples/training/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.py diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh b/examples/training/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh rename to examples/training/verl/hyperpod-eks/rlvr/recipe/evaluate_gptoss.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/language_reward.py b/examples/training/verl/hyperpod-eks/rlvr/recipe/language_reward.py similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/language_reward.py rename to examples/training/verl/hyperpod-eks/rlvr/recipe/language_reward.py diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_dapo_configurable.sh b/examples/training/verl/hyperpod-eks/rlvr/recipe/run_dapo_configurable.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_dapo_configurable.sh rename to examples/training/verl/hyperpod-eks/rlvr/recipe/run_dapo_configurable.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh b/examples/training/verl/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh rename to examples/training/verl/hyperpod-eks/rlvr/recipe/run_gptoss_grpo.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_grpo_configurable.sh b/examples/training/verl/hyperpod-eks/rlvr/recipe/run_grpo_configurable.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/recipe/run_grpo_configurable.sh rename to 
examples/training/verl/hyperpod-eks/rlvr/recipe/run_grpo_configurable.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/IRSA-README.md b/examples/training/verl/hyperpod-eks/rlvr/setup/IRSA-README.md similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/IRSA-README.md rename to examples/training/verl/hyperpod-eks/rlvr/setup/IRSA-README.md diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/build-push.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/build-push.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/build-push.sh rename to examples/training/verl/hyperpod-eks/rlvr/setup/build-push.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/env_vars.example b/examples/training/verl/hyperpod-eks/rlvr/setup/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/env_vars.example rename to examples/training/verl/hyperpod-eks/rlvr/setup/env_vars.example diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/install-kuberay.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/install-kuberay.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/install-kuberay.sh rename to examples/training/verl/hyperpod-eks/rlvr/setup/install-kuberay.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_dapo.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/load_data_dapo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_dapo.sh rename to examples/training/verl/hyperpod-eks/rlvr/setup/load_data_dapo.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_gptoss.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/load_data_gptoss.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_gptoss.sh rename to 
examples/training/verl/hyperpod-eks/rlvr/setup/load_data_gptoss.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_grpo.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/load_data_grpo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/load_data_grpo.sh rename to examples/training/verl/hyperpod-eks/rlvr/setup/load_data_grpo.sh diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/raycluster.yaml b/examples/training/verl/hyperpod-eks/rlvr/setup/raycluster.yaml similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/raycluster.yaml rename to examples/training/verl/hyperpod-eks/rlvr/setup/raycluster.yaml diff --git a/3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/setup-irsa.sh b/examples/training/verl/hyperpod-eks/rlvr/setup/setup-irsa.sh similarity index 100% rename from 3.test_cases/pytorch/verl/hyperpod-eks/rlvr/setup/setup-irsa.sh rename to examples/training/verl/hyperpod-eks/rlvr/setup/setup-irsa.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/Dockerfile b/examples/training/verl/kubernetes/rlvr/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/Dockerfile rename to examples/training/verl/kubernetes/rlvr/Dockerfile diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/README.md b/examples/training/verl/kubernetes/rlvr/README.md similarity index 97% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/README.md rename to examples/training/verl/kubernetes/rlvr/README.md index e4f0233f7..8ce49ea0e 100644 --- a/3.test_cases/pytorch/verl/kubernetes/rlvr/README.md +++ b/examples/training/verl/kubernetes/rlvr/README.md @@ -30,8 +30,8 @@ The example was tested on versions: ### Clone this repo ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/verl/kubernetes/rlvr +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd 
awsome-distributed-ai/examples/training/verl/kubernetes/rlvr ``` ### Install verl repository diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/img/ray-dashboard.png b/examples/training/verl/kubernetes/rlvr/img/ray-dashboard.png similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/img/ray-dashboard.png rename to examples/training/verl/kubernetes/rlvr/img/ray-dashboard.png diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/job-stop.sh b/examples/training/verl/kubernetes/rlvr/job-stop.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/job-stop.sh rename to examples/training/verl/kubernetes/rlvr/job-stop.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/observability/README.md b/examples/training/verl/kubernetes/rlvr/observability/README.md similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/observability/README.md rename to examples/training/verl/kubernetes/rlvr/observability/README.md diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/observability/add-ray-metrics.sh b/examples/training/verl/kubernetes/rlvr/observability/add-ray-metrics.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/observability/add-ray-metrics.sh rename to examples/training/verl/kubernetes/rlvr/observability/add-ray-metrics.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/ray-expose.sh b/examples/training/verl/kubernetes/rlvr/ray-expose.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/ray-expose.sh rename to examples/training/verl/kubernetes/rlvr/ray-expose.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/ray-hide.sh b/examples/training/verl/kubernetes/rlvr/ray-hide.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/ray-hide.sh rename to examples/training/verl/kubernetes/rlvr/ray-hide.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_dapo_configurable.sh 
b/examples/training/verl/kubernetes/rlvr/recipe/run_dapo_configurable.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_dapo_configurable.sh rename to examples/training/verl/kubernetes/rlvr/recipe/run_dapo_configurable.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_grpo_configurable.sh b/examples/training/verl/kubernetes/rlvr/recipe/run_grpo_configurable.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_grpo_configurable.sh rename to examples/training/verl/kubernetes/rlvr/recipe/run_grpo_configurable.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_qwen3-235b_megatron_96gb.sh b/examples/training/verl/kubernetes/rlvr/recipe/run_qwen3-235b_megatron_96gb.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_qwen3-235b_megatron_96gb.sh rename to examples/training/verl/kubernetes/rlvr/recipe/run_qwen3-235b_megatron_96gb.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_qwen3_vl-235b-megatron.sh b/examples/training/verl/kubernetes/rlvr/recipe/run_qwen3_vl-235b-megatron.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/recipe/run_qwen3_vl-235b-megatron.sh rename to examples/training/verl/kubernetes/rlvr/recipe/run_qwen3_vl-235b-megatron.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/build-push.sh b/examples/training/verl/kubernetes/rlvr/setup/build-push.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/build-push.sh rename to examples/training/verl/kubernetes/rlvr/setup/build-push.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/download-model-job.sh b/examples/training/verl/kubernetes/rlvr/setup/download-model-job.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/download-model-job.sh rename to examples/training/verl/kubernetes/rlvr/setup/download-model-job.sh diff 
--git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/env_vars.example b/examples/training/verl/kubernetes/rlvr/setup/env_vars.example similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/env_vars.example rename to examples/training/verl/kubernetes/rlvr/setup/env_vars.example diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/generate-kustomization.sh b/examples/training/verl/kubernetes/rlvr/setup/generate-kustomization.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/generate-kustomization.sh rename to examples/training/verl/kubernetes/rlvr/setup/generate-kustomization.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/install-kuberay.sh b/examples/training/verl/kubernetes/rlvr/setup/install-kuberay.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/install-kuberay.sh rename to examples/training/verl/kubernetes/rlvr/setup/install-kuberay.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_dapo.sh b/examples/training/verl/kubernetes/rlvr/setup/load_data_dapo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_dapo.sh rename to examples/training/verl/kubernetes/rlvr/setup/load_data_dapo.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_geo3k.sh b/examples/training/verl/kubernetes/rlvr/setup/load_data_geo3k.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_geo3k.sh rename to examples/training/verl/kubernetes/rlvr/setup/load_data_geo3k.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_grpo.sh b/examples/training/verl/kubernetes/rlvr/setup/load_data_grpo.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/load_data_grpo.sh rename to examples/training/verl/kubernetes/rlvr/setup/load_data_grpo.sh diff --git 
a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/prepare-model-path.sh b/examples/training/verl/kubernetes/rlvr/setup/prepare-model-path.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/prepare-model-path.sh rename to examples/training/verl/kubernetes/rlvr/setup/prepare-model-path.sh diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/raycluster.yaml b/examples/training/verl/kubernetes/rlvr/setup/raycluster.yaml similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/raycluster.yaml rename to examples/training/verl/kubernetes/rlvr/setup/raycluster.yaml diff --git a/3.test_cases/pytorch/verl/kubernetes/rlvr/setup/submit-qwen-job.sh b/examples/training/verl/kubernetes/rlvr/setup/submit-qwen-job.sh similarity index 100% rename from 3.test_cases/pytorch/verl/kubernetes/rlvr/setup/submit-qwen-job.sh rename to examples/training/verl/kubernetes/rlvr/setup/submit-qwen-job.sh diff --git a/3.test_cases/pytorch/ddp/detr-finetune/.gitignore b/examples/use-cases/detr-finetune/.gitignore similarity index 100% rename from 3.test_cases/pytorch/ddp/detr-finetune/.gitignore rename to examples/use-cases/detr-finetune/.gitignore diff --git a/3.test_cases/pytorch/ddp/detr-finetune/Dockerfile b/examples/use-cases/detr-finetune/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/ddp/detr-finetune/Dockerfile rename to examples/use-cases/detr-finetune/Dockerfile diff --git a/3.test_cases/pytorch/ddp/detr-finetune/README.md b/examples/use-cases/detr-finetune/README.md similarity index 98% rename from 3.test_cases/pytorch/ddp/detr-finetune/README.md rename to examples/use-cases/detr-finetune/README.md index 16d9dc39b..799a1fa73 100644 --- a/3.test_cases/pytorch/ddp/detr-finetune/README.md +++ b/examples/use-cases/detr-finetune/README.md @@ -45,7 +45,7 @@ across multiple GPU nodes connected with EFA networking. 
- An Amazon SageMaker HyperPod EKS cluster or Amazon EKS cluster with GPU nodes (e.g., `ml.g5.8xlarge`), accessible via `kubectl`. We recommend setting up the - cluster using the templates in [1.architectures](../../../../1.architectures). + cluster using the templates in [architectures](../../../architectures). - An Amazon FSx for Lustre persistent volume claim (default name: `fsx-pvc`; see [kubernetes/README.md](kubernetes/README.md) if your cluster uses a different PVC name). diff --git a/3.test_cases/pytorch/ddp/detr-finetune/data/README.md b/examples/use-cases/detr-finetune/data/README.md similarity index 100% rename from 3.test_cases/pytorch/ddp/detr-finetune/data/README.md rename to examples/use-cases/detr-finetune/data/README.md diff --git a/3.test_cases/pytorch/ddp/detr-finetune/detr_main.py b/examples/use-cases/detr-finetune/detr_main.py similarity index 100% rename from 3.test_cases/pytorch/ddp/detr-finetune/detr_main.py rename to examples/use-cases/detr-finetune/detr_main.py diff --git a/3.test_cases/pytorch/ddp/detr-finetune/kubernetes/README.md b/examples/use-cases/detr-finetune/kubernetes/README.md similarity index 98% rename from 3.test_cases/pytorch/ddp/detr-finetune/kubernetes/README.md rename to examples/use-cases/detr-finetune/kubernetes/README.md index 302b6e890..5e3b1a57c 100644 --- a/3.test_cases/pytorch/ddp/detr-finetune/kubernetes/README.md +++ b/examples/use-cases/detr-finetune/kubernetes/README.md @@ -20,7 +20,7 @@ The guide assumes that you have the following: - The dataset uploaded to FSx at `/fsx/data/` (see [data/README.md](../data/README.md)). We recommend that you setup a Kubernetes cluster using the templates in the -architectures [directory](../../../../../1.architectures). +architectures [directory](../../../../architectures). > **Note**: Amazon SageMaker HyperPod EKS clusters come with the Kubeflow > Training Operator pre-installed. 
If you are using a vanilla EKS cluster, diff --git a/3.test_cases/pytorch/ddp/detr-finetune/kubernetes/detr-resnet50-finetune.yaml-template b/examples/use-cases/detr-finetune/kubernetes/detr-resnet50-finetune.yaml-template similarity index 100% rename from 3.test_cases/pytorch/ddp/detr-finetune/kubernetes/detr-resnet50-finetune.yaml-template rename to examples/use-cases/detr-finetune/kubernetes/detr-resnet50-finetune.yaml-template diff --git a/3.test_cases/23.SMHP-esm2/0.download_data.py b/examples/use-cases/esm2-hyperpod/0.download_data.py similarity index 100% rename from 3.test_cases/23.SMHP-esm2/0.download_data.py rename to examples/use-cases/esm2-hyperpod/0.download_data.py diff --git a/3.test_cases/23.SMHP-esm2/1.tokenize_uniref_csv.py b/examples/use-cases/esm2-hyperpod/1.tokenize_uniref_csv.py similarity index 100% rename from 3.test_cases/23.SMHP-esm2/1.tokenize_uniref_csv.py rename to examples/use-cases/esm2-hyperpod/1.tokenize_uniref_csv.py diff --git a/3.test_cases/23.SMHP-esm2/2.train_ddp.sh b/examples/use-cases/esm2-hyperpod/2.train_ddp.sh similarity index 100% rename from 3.test_cases/23.SMHP-esm2/2.train_ddp.sh rename to examples/use-cases/esm2-hyperpod/2.train_ddp.sh diff --git a/3.test_cases/23.SMHP-esm2/3.train_fsdp.sh b/examples/use-cases/esm2-hyperpod/3.train_fsdp.sh similarity index 100% rename from 3.test_cases/23.SMHP-esm2/3.train_fsdp.sh rename to examples/use-cases/esm2-hyperpod/3.train_fsdp.sh diff --git a/3.test_cases/23.SMHP-esm2/4.train_docker_dpp.sh b/examples/use-cases/esm2-hyperpod/4.train_docker_dpp.sh similarity index 100% rename from 3.test_cases/23.SMHP-esm2/4.train_docker_dpp.sh rename to examples/use-cases/esm2-hyperpod/4.train_docker_dpp.sh diff --git a/3.test_cases/23.SMHP-esm2/Dockerfile b/examples/use-cases/esm2-hyperpod/Dockerfile similarity index 100% rename from 3.test_cases/23.SMHP-esm2/Dockerfile rename to examples/use-cases/esm2-hyperpod/Dockerfile diff --git a/3.test_cases/23.SMHP-esm2/README.md 
b/examples/use-cases/esm2-hyperpod/README.md similarity index 99% rename from 3.test_cases/23.SMHP-esm2/README.md rename to examples/use-cases/esm2-hyperpod/README.md index e11745cde..006d3e66c 100644 --- a/3.test_cases/23.SMHP-esm2/README.md +++ b/examples/use-cases/esm2-hyperpod/README.md @@ -10,7 +10,7 @@ ESM-2 is a powerful pLM. We will demonstrate how to use QLoRA to fine-tune ESM-2 on g5.24xlarge instances. We will use ESM-2 to predict [subcellular localization](https://academic.oup.com/nar/article/50/W1/W228/6576357?login=false). Understanding where proteins appear in cells can help us understand their role in disease and find new drug targets. ## 0. Prerequisites -You will need to set up a SageMaker Hyperpod cluster using 2 g5.24xlarge instances with a shared parallel filesystem such as [Amazon FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/getting-started.html). See the sagemaker-hyperpod section in the [Sagemaker Hyperpod](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod) folder for setup instructions. +You will need to set up a SageMaker Hyperpod cluster using 2 g5.24xlarge instances with a shared parallel filesystem such as [Amazon FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/getting-started.html). See the sagemaker-hyperpod section in the [Sagemaker Hyperpod](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-slurm) folder for setup instructions. ## 1. 
Install conda diff --git a/3.test_cases/23.SMHP-esm2/build.sh b/examples/use-cases/esm2-hyperpod/build.sh similarity index 100% rename from 3.test_cases/23.SMHP-esm2/build.sh rename to examples/use-cases/esm2-hyperpod/build.sh diff --git a/3.test_cases/23.SMHP-esm2/enroot.sh b/examples/use-cases/esm2-hyperpod/enroot.sh similarity index 100% rename from 3.test_cases/23.SMHP-esm2/enroot.sh rename to examples/use-cases/esm2-hyperpod/enroot.sh diff --git a/3.test_cases/23.SMHP-esm2/requirements.txt b/examples/use-cases/esm2-hyperpod/requirements.txt similarity index 100% rename from 3.test_cases/23.SMHP-esm2/requirements.txt rename to examples/use-cases/esm2-hyperpod/requirements.txt diff --git a/3.test_cases/23.SMHP-esm2/train.py b/examples/use-cases/esm2-hyperpod/train.py similarity index 100% rename from 3.test_cases/23.SMHP-esm2/train.py rename to examples/use-cases/esm2-hyperpod/train.py diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/.gitignore b/examples/use-cases/isaac-lab/.gitignore similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/.gitignore rename to examples/use-cases/isaac-lab/.gitignore diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/README.md b/examples/use-cases/isaac-lab/README.md similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/README.md rename to examples/use-cases/isaac-lab/README.md diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/config.yaml.example b/examples/use-cases/isaac-lab/config.yaml.example similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/config.yaml.example rename to examples/use-cases/isaac-lab/config.yaml.example diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/docker/Dockerfile b/examples/use-cases/isaac-lab/docker/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/docker/Dockerfile rename to examples/use-cases/isaac-lab/docker/Dockerfile diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/generate.py 
b/examples/use-cases/isaac-lab/generate.py similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/generate.py rename to examples/use-cases/isaac-lab/generate.py diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/scripts/mlflow_isaaclab.py b/examples/use-cases/isaac-lab/scripts/mlflow_isaaclab.py similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/scripts/mlflow_isaaclab.py rename to examples/use-cases/isaac-lab/scripts/mlflow_isaaclab.py diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/scripts/run_train.py b/examples/use-cases/isaac-lab/scripts/run_train.py similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/scripts/run_train.py rename to examples/use-cases/isaac-lab/scripts/run_train.py diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/scripts/sm-train-entrypoint.sh b/examples/use-cases/isaac-lab/scripts/sm-train-entrypoint.sh similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/scripts/sm-train-entrypoint.sh rename to examples/use-cases/isaac-lab/scripts/sm-train-entrypoint.sh diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/templates/launch-sm-training.py.tpl b/examples/use-cases/isaac-lab/templates/launch-sm-training.py.tpl similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/templates/launch-sm-training.py.tpl rename to examples/use-cases/isaac-lab/templates/launch-sm-training.py.tpl diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/templates/storage.yaml.tpl b/examples/use-cases/isaac-lab/templates/storage.yaml.tpl similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/templates/storage.yaml.tpl rename to examples/use-cases/isaac-lab/templates/storage.yaml.tpl diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/templates/tensorboard.yaml.tpl b/examples/use-cases/isaac-lab/templates/tensorboard.yaml.tpl similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/templates/tensorboard.yaml.tpl rename to 
examples/use-cases/isaac-lab/templates/tensorboard.yaml.tpl diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/templates/training-job.yaml.tpl b/examples/use-cases/isaac-lab/templates/training-job.yaml.tpl similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/templates/training-job.yaml.tpl rename to examples/use-cases/isaac-lab/templates/training-job.yaml.tpl diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/templates/viz-eks-webrtc-pod.yaml.tpl b/examples/use-cases/isaac-lab/templates/viz-eks-webrtc-pod.yaml.tpl similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/templates/viz-eks-webrtc-pod.yaml.tpl rename to examples/use-cases/isaac-lab/templates/viz-eks-webrtc-pod.yaml.tpl diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/README.md b/examples/use-cases/isaac-lab/viz-scripts/README.md similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/README.md rename to examples/use-cases/isaac-lab/viz-scripts/README.md diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-h1-demo.sh b/examples/use-cases/isaac-lab/viz-scripts/run-h1-demo.sh similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-h1-demo.sh rename to examples/use-cases/isaac-lab/viz-scripts/run-h1-demo.sh diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-skrl-play.sh b/examples/use-cases/isaac-lab/viz-scripts/run-skrl-play.sh similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-skrl-play.sh rename to examples/use-cases/isaac-lab/viz-scripts/run-skrl-play.sh diff --git a/3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-tensorboard.sh b/examples/use-cases/isaac-lab/viz-scripts/run-tensorboard.sh similarity index 100% rename from 3.test_cases/pytorch/nvidia-isaac-lab/viz-scripts/run-tensorboard.sh rename to examples/use-cases/isaac-lab/viz-scripts/run-tensorboard.sh diff --git a/3.test_cases/pytorch/distillation/Dockerfile 
b/examples/use-cases/llm-distillation/Dockerfile similarity index 100% rename from 3.test_cases/pytorch/distillation/Dockerfile rename to examples/use-cases/llm-distillation/Dockerfile diff --git a/3.test_cases/pytorch/distillation/README.md b/examples/use-cases/llm-distillation/README.md similarity index 98% rename from 3.test_cases/pytorch/distillation/README.md rename to examples/use-cases/llm-distillation/README.md index df4386426..5c64967d9 100644 --- a/3.test_cases/pytorch/distillation/README.md +++ b/examples/use-cases/llm-distillation/README.md @@ -4,7 +4,7 @@ This walkthrough demonstrates how to set up and run large language model (LLM) k ## Repository Structure ``` -3.test_cases/pytorch/distillation/ +examples/use-cases/llm-distillation/ ├── Dockerfile # Container definition for running distillation workloads ├── kubernetes/ │ └── distill.yaml # Kubernetes configuration for distributed training @@ -25,8 +25,8 @@ First, prepare your container image with all necessary dependencies: ```bash # Clone the repository cd ~ -git clone https://github.com/awslabs/awsome-distributed-training/ -cd awsome-distributed-training/3.test_cases/pytorch/distillation +git clone https://github.com/awslabs/awsome-distributed-ai/ +cd awsome-distributed-ai/examples/use-cases/llm-distillation # Set up environment variables aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/hpc-cloud diff --git a/3.test_cases/pytorch/distillation/kubernetes/distill.yaml-template b/examples/use-cases/llm-distillation/kubernetes/distill.yaml-template similarity index 100% rename from 3.test_cases/pytorch/distillation/kubernetes/distill.yaml-template rename to examples/use-cases/llm-distillation/kubernetes/distill.yaml-template diff --git a/3.test_cases/pytorch/distillation/src/distil_logits_cli.py b/examples/use-cases/llm-distillation/src/distil_logits_cli.py similarity index 100% rename from 
3.test_cases/pytorch/distillation/src/distil_logits_cli.py rename to examples/use-cases/llm-distillation/src/distil_logits_cli.py diff --git a/3.test_cases/pytorch/distillation/src/requirements.txt b/examples/use-cases/llm-distillation/src/requirements.txt similarity index 100% rename from 3.test_cases/pytorch/distillation/src/requirements.txt rename to examples/use-cases/llm-distillation/src/requirements.txt diff --git a/3.test_cases/pytorch/nanoVLM/README.md b/examples/use-cases/nanovlm/README.md similarity index 98% rename from 3.test_cases/pytorch/nanoVLM/README.md rename to examples/use-cases/nanovlm/README.md index 46d9a7a0a..6b8df68c2 100644 --- a/3.test_cases/pytorch/nanoVLM/README.md +++ b/examples/use-cases/nanovlm/README.md @@ -17,8 +17,8 @@ Make sure that your current directory is under a shared filesystem such as `/fsx ```bash cd ~ - git clone https://github.com/aws-samples/awsome-distributed-training/ - cd awsome-distributed-training/3.test_cases/pytorch/nanoVLM/ + git clone https://github.com/awslabs/awsome-distributed-ai/ + cd awsome-distributed-ai/examples/use-cases/nanovlm/ ``` diff --git a/3.test_cases/pytorch/nanoVLM/nanovlm.Dockerfile b/examples/use-cases/nanovlm/nanovlm.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/nanoVLM/nanovlm.Dockerfile rename to examples/use-cases/nanovlm/nanovlm.Dockerfile diff --git a/3.test_cases/pytorch/nanoVLM/slurm/download_dataset.sbatch b/examples/use-cases/nanovlm/slurm/download_dataset.sbatch similarity index 100% rename from 3.test_cases/pytorch/nanoVLM/slurm/download_dataset.sbatch rename to examples/use-cases/nanovlm/slurm/download_dataset.sbatch diff --git a/3.test_cases/pytorch/nanoVLM/slurm/launch_evaluation.sbatch b/examples/use-cases/nanovlm/slurm/launch_evaluation.sbatch similarity index 100% rename from 3.test_cases/pytorch/nanoVLM/slurm/launch_evaluation.sbatch rename to examples/use-cases/nanovlm/slurm/launch_evaluation.sbatch diff --git
a/3.test_cases/pytorch/nanoVLM/slurm/launch_training.sbatch b/examples/use-cases/nanovlm/slurm/launch_training.sbatch similarity index 100% rename from 3.test_cases/pytorch/nanoVLM/slurm/launch_training.sbatch rename to examples/use-cases/nanovlm/slurm/launch_training.sbatch diff --git a/3.test_cases/pytorch/vjepa2.1/.gitignore b/examples/use-cases/vjepa2.1/.gitignore similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/.gitignore rename to examples/use-cases/vjepa2.1/.gitignore diff --git a/3.test_cases/pytorch/vjepa2.1/README.md b/examples/use-cases/vjepa2.1/README.md similarity index 98% rename from 3.test_cases/pytorch/vjepa2.1/README.md rename to examples/use-cases/vjepa2.1/README.md index 342789eaa..cf60e3492 100644 --- a/3.test_cases/pytorch/vjepa2.1/README.md +++ b/examples/use-cases/vjepa2.1/README.md @@ -53,8 +53,8 @@ Several utility scripts (`generate_synthetic_dataset.py`, `nsys_wrapper.sh`, `pr ## 1. Clone this repository ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/vjepa2.1 +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/use-cases/vjepa2.1 ``` ## 2. 
Datasets diff --git a/3.test_cases/pytorch/vjepa2.1/configs/benchmark-vitg-8nodes-optimized.yaml b/examples/use-cases/vjepa2.1/configs/benchmark-vitg-8nodes-optimized.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/configs/benchmark-vitg-8nodes-optimized.yaml rename to examples/use-cases/vjepa2.1/configs/benchmark-vitg-8nodes-optimized.yaml diff --git a/3.test_cases/pytorch/vjepa2.1/configs/benchmark-vitg-8nodes.yaml b/examples/use-cases/vjepa2.1/configs/benchmark-vitg-8nodes.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/configs/benchmark-vitg-8nodes.yaml rename to examples/use-cases/vjepa2.1/configs/benchmark-vitg-8nodes.yaml diff --git a/3.test_cases/pytorch/vjepa2.1/configs/pretrain-vitg-256px-16f.yaml b/examples/use-cases/vjepa2.1/configs/pretrain-vitg-256px-16f.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/configs/pretrain-vitg-256px-16f.yaml rename to examples/use-cases/vjepa2.1/configs/pretrain-vitg-256px-16f.yaml diff --git a/3.test_cases/pytorch/vjepa2.1/kubernetes/vjepa2-1-benchmark.yaml b/examples/use-cases/vjepa2.1/kubernetes/vjepa2-1-benchmark.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/kubernetes/vjepa2-1-benchmark.yaml rename to examples/use-cases/vjepa2.1/kubernetes/vjepa2-1-benchmark.yaml diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/generate_synthetic_dataset.py b/examples/use-cases/vjepa2.1/scripts/generate_synthetic_dataset.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/generate_synthetic_dataset.py rename to examples/use-cases/vjepa2.1/scripts/generate_synthetic_dataset.py diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/generate_synthetic_images.py b/examples/use-cases/vjepa2.1/scripts/generate_synthetic_images.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/generate_synthetic_images.py rename to examples/use-cases/vjepa2.1/scripts/generate_synthetic_images.py diff --git 
a/3.test_cases/pytorch/vjepa2.1/scripts/nsys_wrapper.sh b/examples/use-cases/vjepa2.1/scripts/nsys_wrapper.sh similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/nsys_wrapper.sh rename to examples/use-cases/vjepa2.1/scripts/nsys_wrapper.sh diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/parse_benchmark.py b/examples/use-cases/vjepa2.1/scripts/parse_benchmark.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/parse_benchmark.py rename to examples/use-cases/vjepa2.1/scripts/parse_benchmark.py diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/prepare_ssv2.py b/examples/use-cases/vjepa2.1/scripts/prepare_ssv2.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/prepare_ssv2.py rename to examples/use-cases/vjepa2.1/scripts/prepare_ssv2.py diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/run_train.py b/examples/use-cases/vjepa2.1/scripts/run_train.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/run_train.py rename to examples/use-cases/vjepa2.1/scripts/run_train.py diff --git a/3.test_cases/pytorch/vjepa2.1/scripts/test_decord.py b/examples/use-cases/vjepa2.1/scripts/test_decord.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/scripts/test_decord.py rename to examples/use-cases/vjepa2.1/scripts/test_decord.py diff --git a/3.test_cases/pytorch/vjepa2.1/slurm/benchmark_training.sbatch b/examples/use-cases/vjepa2.1/slurm/benchmark_training.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/slurm/benchmark_training.sbatch rename to examples/use-cases/vjepa2.1/slurm/benchmark_training.sbatch diff --git a/3.test_cases/pytorch/vjepa2.1/slurm/benchmark_training_b200.sbatch b/examples/use-cases/vjepa2.1/slurm/benchmark_training_b200.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/slurm/benchmark_training_b200.sbatch rename to examples/use-cases/vjepa2.1/slurm/benchmark_training_b200.sbatch diff --git 
a/3.test_cases/pytorch/vjepa2.1/slurm/launch_training.sbatch b/examples/use-cases/vjepa2.1/slurm/launch_training.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/slurm/launch_training.sbatch rename to examples/use-cases/vjepa2.1/slurm/launch_training.sbatch diff --git a/3.test_cases/pytorch/vjepa2.1/slurm/nsys_profile.sbatch b/examples/use-cases/vjepa2.1/slurm/nsys_profile.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/slurm/nsys_profile.sbatch rename to examples/use-cases/vjepa2.1/slurm/nsys_profile.sbatch diff --git a/3.test_cases/pytorch/vjepa2.1/slurm/nsys_profile_b200.sbatch b/examples/use-cases/vjepa2.1/slurm/nsys_profile_b200.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/slurm/nsys_profile_b200.sbatch rename to examples/use-cases/vjepa2.1/slurm/nsys_profile_b200.sbatch diff --git a/3.test_cases/pytorch/vjepa2.1/vjepa2_1.Dockerfile b/examples/use-cases/vjepa2.1/vjepa2_1.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/vjepa2.1/vjepa2_1.Dockerfile rename to examples/use-cases/vjepa2.1/vjepa2_1.Dockerfile diff --git a/3.test_cases/pytorch/vjepa2/.gitignore b/examples/use-cases/vjepa2/.gitignore similarity index 100% rename from 3.test_cases/pytorch/vjepa2/.gitignore rename to examples/use-cases/vjepa2/.gitignore diff --git a/3.test_cases/pytorch/vjepa2/README.md b/examples/use-cases/vjepa2/README.md similarity index 99% rename from 3.test_cases/pytorch/vjepa2/README.md rename to examples/use-cases/vjepa2/README.md index 5bb0b08da..c3c4f7ead 100644 --- a/3.test_cases/pytorch/vjepa2/README.md +++ b/examples/use-cases/vjepa2/README.md @@ -36,8 +36,8 @@ We benchmark the **ViT-g/16 (1B parameters)** encoder variant using the **Someth ## 1. 
Clone this repository ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/pytorch/vjepa2 +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/examples/use-cases/vjepa2 ``` ## 2. Dataset: Something-Something v2 (SSv2) diff --git a/3.test_cases/pytorch/vjepa2/configs/benchmark-vitg-8nodes-optimized.yaml b/examples/use-cases/vjepa2/configs/benchmark-vitg-8nodes-optimized.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2/configs/benchmark-vitg-8nodes-optimized.yaml rename to examples/use-cases/vjepa2/configs/benchmark-vitg-8nodes-optimized.yaml diff --git a/3.test_cases/pytorch/vjepa2/configs/benchmark-vitg-8nodes.yaml b/examples/use-cases/vjepa2/configs/benchmark-vitg-8nodes.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2/configs/benchmark-vitg-8nodes.yaml rename to examples/use-cases/vjepa2/configs/benchmark-vitg-8nodes.yaml diff --git a/3.test_cases/pytorch/vjepa2/configs/pretrain-vitg-256px-16f.yaml b/examples/use-cases/vjepa2/configs/pretrain-vitg-256px-16f.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2/configs/pretrain-vitg-256px-16f.yaml rename to examples/use-cases/vjepa2/configs/pretrain-vitg-256px-16f.yaml diff --git a/3.test_cases/pytorch/vjepa2/kubernetes/vjepa2-benchmark.yaml b/examples/use-cases/vjepa2/kubernetes/vjepa2-benchmark.yaml similarity index 100% rename from 3.test_cases/pytorch/vjepa2/kubernetes/vjepa2-benchmark.yaml rename to examples/use-cases/vjepa2/kubernetes/vjepa2-benchmark.yaml diff --git a/3.test_cases/pytorch/vjepa2/scripts/generate_synthetic_dataset.py b/examples/use-cases/vjepa2/scripts/generate_synthetic_dataset.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/generate_synthetic_dataset.py rename to examples/use-cases/vjepa2/scripts/generate_synthetic_dataset.py diff --git a/3.test_cases/pytorch/vjepa2/scripts/nsys_wrapper.sh 
b/examples/use-cases/vjepa2/scripts/nsys_wrapper.sh similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/nsys_wrapper.sh rename to examples/use-cases/vjepa2/scripts/nsys_wrapper.sh diff --git a/3.test_cases/pytorch/vjepa2/scripts/parse_benchmark.py b/examples/use-cases/vjepa2/scripts/parse_benchmark.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/parse_benchmark.py rename to examples/use-cases/vjepa2/scripts/parse_benchmark.py diff --git a/3.test_cases/pytorch/vjepa2/scripts/prepare_ssv2.py b/examples/use-cases/vjepa2/scripts/prepare_ssv2.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/prepare_ssv2.py rename to examples/use-cases/vjepa2/scripts/prepare_ssv2.py diff --git a/3.test_cases/pytorch/vjepa2/scripts/run_train.py b/examples/use-cases/vjepa2/scripts/run_train.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/run_train.py rename to examples/use-cases/vjepa2/scripts/run_train.py diff --git a/3.test_cases/pytorch/vjepa2/scripts/test_decord.py b/examples/use-cases/vjepa2/scripts/test_decord.py similarity index 100% rename from 3.test_cases/pytorch/vjepa2/scripts/test_decord.py rename to examples/use-cases/vjepa2/scripts/test_decord.py diff --git a/3.test_cases/pytorch/vjepa2/slurm/benchmark_training.sbatch b/examples/use-cases/vjepa2/slurm/benchmark_training.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/benchmark_training.sbatch rename to examples/use-cases/vjepa2/slurm/benchmark_training.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/benchmark_training_b200.sbatch b/examples/use-cases/vjepa2/slurm/benchmark_training_b200.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/benchmark_training_b200.sbatch rename to examples/use-cases/vjepa2/slurm/benchmark_training_b200.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/benchmark_training_b200_optimized.sbatch 
b/examples/use-cases/vjepa2/slurm/benchmark_training_b200_optimized.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/benchmark_training_b200_optimized.sbatch rename to examples/use-cases/vjepa2/slurm/benchmark_training_b200_optimized.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/download_dataset.sbatch b/examples/use-cases/vjepa2/slurm/download_dataset.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/download_dataset.sbatch rename to examples/use-cases/vjepa2/slurm/download_dataset.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/launch_training.sbatch b/examples/use-cases/vjepa2/slurm/launch_training.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/launch_training.sbatch rename to examples/use-cases/vjepa2/slurm/launch_training.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/nsys_profile.sbatch b/examples/use-cases/vjepa2/slurm/nsys_profile.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/nsys_profile.sbatch rename to examples/use-cases/vjepa2/slurm/nsys_profile.sbatch diff --git a/3.test_cases/pytorch/vjepa2/slurm/nsys_profile_b200.sbatch b/examples/use-cases/vjepa2/slurm/nsys_profile_b200.sbatch similarity index 100% rename from 3.test_cases/pytorch/vjepa2/slurm/nsys_profile_b200.sbatch rename to examples/use-cases/vjepa2/slurm/nsys_profile_b200.sbatch diff --git a/3.test_cases/pytorch/vjepa2/vjepa2.Dockerfile b/examples/use-cases/vjepa2/vjepa2.Dockerfile similarity index 100% rename from 3.test_cases/pytorch/vjepa2/vjepa2.Dockerfile rename to examples/use-cases/vjepa2/vjepa2.Dockerfile diff --git a/micro-benchmarks/nccl-tests/README.md b/micro-benchmarks/nccl-tests/README.md index fe7dd6a31..65c803c23 100644 --- a/micro-benchmarks/nccl-tests/README.md +++ b/micro-benchmarks/nccl-tests/README.md @@ -12,7 +12,7 @@ If you are using Slurm, this guide assumes that you have the following: - Enroot requires libmd to compile and squashfs-tools to 
execute. - A shared directory mounted on `/apps` -It is recommended that you use the templates in the architectures [directory](../../1.architectures) +It is recommended that you use the templates in the architectures [directory](../../architectures) ### Amazon EKS If you are using EKS, this guide assumes that you have the following: diff --git a/micro-benchmarks/nccl-tests/aws-batch/README.md b/micro-benchmarks/nccl-tests/aws-batch/README.md index 3c67cc21e..8d8990195 100644 --- a/micro-benchmarks/nccl-tests/aws-batch/README.md +++ b/micro-benchmarks/nccl-tests/aws-batch/README.md @@ -7,8 +7,8 @@ 2. Next you can deploy the AWS Batch template included in this PR, where `cr-1234567890` is the id of your capacity block and `aws-batch-vpc` is the name of the vpc stack you created above. ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/1.architectures/3.aws-batch +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/architectures/aws-batch aws cloudformation create-stack --stack-name aws-batch-p5 \ --template-body file://0.aws-batch-distributed-training-p5.yaml \ --parameters ParameterKey=VPCStackParameter,ParameterValue="aws-batch-vpc" \ diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md index 9a6f47430..560dc7c17 100644 --- a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md @@ -19,7 +19,7 @@ If you are using Slurm, this guide assumes that you have the following: - Enroot requires libmd to compile and squashfs-tools to execute. 
- A shared directory mounted on `/fsxl` -It is recommended that you use the templates in the architectures [directory](../../../../1.architectures) +It is recommended that you use the templates in the architectures [directory](../../../../architectures) ## 1. Prepare the container image and other artifacts @@ -97,8 +97,8 @@ The file will be stored in the `/fsxl` directory. ### Slurm with container -clone the awsome-distributed-training repo on your head node -`git clone https://github.com/awslabs/awsome-distributed-training.git` +clone the awsome-distributed-ai repo on your head node +`git clone https://github.com/awslabs/awsome-distributed-ai.git` Navigate to the topology-aware-nccl-tests directory: diff --git a/micro-benchmarks/nccom-tests/slurm/README.md b/micro-benchmarks/nccom-tests/slurm/README.md index 8166a8980..fa8012404 100644 --- a/micro-benchmarks/nccom-tests/slurm/README.md +++ b/micro-benchmarks/nccom-tests/slurm/README.md @@ -6,7 +6,7 @@ This guide assumes that you have the following: - A functional Slurm cluster on AWS. - `aws-neuronx-tools` installed on all the compute instances. 
-It is recommended that you use the templates in the architectures [directory](../../1.architectures) +It is recommended that you use the templates in the architectures [directory](../../../architectures) ## Running NCCOM Tests diff --git a/4.validation_and_observability/3.efa-node-exporter/Dockerfile b/validation_and_observability/efa-node-exporter/Dockerfile similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/Dockerfile rename to validation_and_observability/efa-node-exporter/Dockerfile diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml b/validation_and_observability/efa-node-exporter/EKS/Chart.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml rename to validation_and_observability/efa-node-exporter/EKS/Chart.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/README.md b/validation_and_observability/efa-node-exporter/EKS/README.md similarity index 98% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/README.md rename to validation_and_observability/efa-node-exporter/EKS/README.md index de4d85a9a..eee5cd4aa 100644 --- a/4.validation_and_observability/3.efa-node-exporter/EKS/README.md +++ b/validation_and_observability/efa-node-exporter/EKS/README.md @@ -20,8 +20,8 @@ export LOCAL_PORT=9000 # Local port to curl prometheus metrics To build the Docker image: ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/4.validation_and_observability/3.efa-node-exporter/ +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/validation_and_observability/efa-node-exporter/ docker build -t ${REGISTRY}${IMAGE}${TAG} -f Dockerfile .
``` diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml b/validation_and_observability/efa-node-exporter/EKS/ci/port-values.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml rename to validation_and_observability/efa-node-exporter/EKS/ci/port-values.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml b/validation_and_observability/efa-node-exporter/EKS/efa-exporter-values-temp.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml rename to validation_and_observability/efa-node-exporter/EKS/efa-exporter-values-temp.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt b/validation_and_observability/efa-node-exporter/EKS/templates/NOTES.txt similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt rename to validation_and_observability/efa-node-exporter/EKS/templates/NOTES.txt diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl b/validation_and_observability/efa-node-exporter/EKS/templates/_helpers.tpl similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl rename to validation_and_observability/efa-node-exporter/EKS/templates/_helpers.tpl diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/clusterrole.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/clusterrole.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml 
b/validation_and_observability/efa-node-exporter/EKS/templates/clusterrolebinding.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/clusterrolebinding.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/daemonset.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/daemonset.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/endpoints.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/endpoints.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/extra-manifests.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/extra-manifests.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/networkpolicy.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/networkpolicy.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml 
b/validation_and_observability/efa-node-exporter/EKS/templates/podmonitor.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/podmonitor.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/psp-clusterrole.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/psp-clusterrole.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/psp.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/psp.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/rbac-configmap.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/rbac-configmap.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml 
b/validation_and_observability/efa-node-exporter/EKS/templates/service.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/service.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/serviceaccount.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/serviceaccount.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/servicemonitor.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/servicemonitor.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml b/validation_and_observability/efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml rename to validation_and_observability/efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml diff --git a/4.validation_and_observability/3.efa-node-exporter/Makefile b/validation_and_observability/efa-node-exporter/Makefile similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/Makefile rename to validation_and_observability/efa-node-exporter/Makefile diff --git a/4.validation_and_observability/3.efa-node-exporter/README.md b/validation_and_observability/efa-node-exporter/README.md similarity index 95% rename from 4.validation_and_observability/3.efa-node-exporter/README.md rename 
to validation_and_observability/efa-node-exporter/README.md index 5f7c38089..6353d5a68 100644 --- a/4.validation_and_observability/3.efa-node-exporter/README.md +++ b/validation_and_observability/efa-node-exporter/README.md @@ -7,8 +7,8 @@ Scripted fork of the [Prometheus Node Exporter](https://github.com/prometheus/no To create the docker image run: ```bash -git clone https://github.com/awslabs/awsome-distributed-training.git -cd awsome-distributed-training/4.validation_and_observability/3.efa-node-exporter +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/validation_and_observability/efa-node-exporter make ``` diff --git a/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go b/validation_and_observability/efa-node-exporter/amazon_efa_linux.go similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go rename to validation_and_observability/efa-node-exporter/amazon_efa_linux.go diff --git a/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml b/validation_and_observability/efa-node-exporter/buildspec.yaml similarity index 87% rename from 4.validation_and_observability/3.efa-node-exporter/buildspec.yaml rename to validation_and_observability/efa-node-exporter/buildspec.yaml index 5792fd47f..bcf269349 100644 --- a/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml +++ b/validation_and_observability/efa-node-exporter/buildspec.yaml @@ -20,7 +20,7 @@ phases: - export REPO_URI="$(aws ecr describe-repositories | grep repositoryUri | grep /${ECR_REPOSITORY_NAME}\" | cut -d '"' -f 4)" - echo "REPO_URI=$REPO_URI" - echo "Building ${REPO_URI}:${TAG} ..." - - cd 4.validation_and_observability/3.efa-node-exporter && docker image build --build-arg NODE_EXPORTER_VERSION=$NODE_EXPORTER_VERSION --build-arg PROCFS_EXPORTER_VERSION=$PROCFS_EXPORTER_VERSION -t ${REPO_URI}:${TAG} -f ./Dockerfile . 
+ - cd validation_and_observability/efa-node-exporter && docker image build --build-arg NODE_EXPORTER_VERSION=$NODE_EXPORTER_VERSION --build-arg PROCFS_EXPORTER_VERSION=$PROCFS_EXPORTER_VERSION -t ${REPO_URI}:${TAG} -f ./Dockerfile . post_build: commands: - export ECR_URI=${REPO_URI%"/${ECR_REPOSITORY_NAME}"} diff --git a/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go b/validation_and_observability/efa-node-exporter/class_amazon_efa.go similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go rename to validation_and_observability/efa-node-exporter/class_amazon_efa.go diff --git a/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml b/validation_and_observability/efa-node-exporter/docker-compose.yml similarity index 100% rename from 4.validation_and_observability/3.efa-node-exporter/docker-compose.yml rename to validation_and_observability/efa-node-exporter/docker-compose.yml diff --git a/4.validation_and_observability/efa-versions.py b/validation_and_observability/efa-versions.py similarity index 100% rename from 4.validation_and_observability/efa-versions.py rename to validation_and_observability/efa-versions.py diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/README.md b/validation_and_observability/gpu-cluster-healthcheck/README.md similarity index 99% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/README.md rename to validation_and_observability/gpu-cluster-healthcheck/README.md index 76962e95e..fa5e6e9ff 100644 --- a/4.validation_and_observability/2.gpu-cluster-healthcheck/README.md +++ b/validation_and_observability/gpu-cluster-healthcheck/README.md @@ -21,8 +21,8 @@ The suite provides two operational modes: **lightweight checks** for regular use ```bash # Clone the repository -git clone https://github.com/awslabs/awsome-distributed-training.git -cd 
awsome-distributed-training/4.validation_and_observability/2.gpu-cluster-healthcheck +git clone https://github.com/awslabs/awsome-distributed-ai.git +cd awsome-distributed-ai/validation_and_observability/gpu-cluster-healthcheck # Make all scripts executable chmod +x gpu-healthcheck.sh checks/*.sh slurm/*.sh slurm/examples/*.sh @@ -49,7 +49,7 @@ cat /tmp/gpu-healthcheck-*/summary.json | python3 -m json.tool ### Directory Structure ``` -2.gpu-cluster-healthcheck/ +gpu-cluster-healthcheck/ ├── README.md # This documentation ├── gpu-healthcheck.sh # Master orchestrator ├── instance-profiles.conf # Per-instance-type hardware expectations diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/0-nvidia-smi-check.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/0-nvidia-smi-check.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/0-nvidia-smi-check.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/0-nvidia-smi-check.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/1-dcgm-diag-l2.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/1-dcgm-diag-l2.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/1-dcgm-diag-l2.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/1-dcgm-diag-l2.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/2-efa-enumeration.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/2-efa-enumeration.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/2-efa-enumeration.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/2-efa-enumeration.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/3-topology-check.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/3-topology-check.sh similarity 
index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/3-topology-check.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/3-topology-check.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/4-dcgm-diag-l4.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/4-dcgm-diag-l4.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/4-dcgm-diag-l4.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/4-dcgm-diag-l4.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/5-nccl-allreduce.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/5-nccl-allreduce.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/5-nccl-allreduce.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/5-nccl-allreduce.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/checks/6-efa-loopback.sh b/validation_and_observability/gpu-cluster-healthcheck/checks/6-efa-loopback.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/checks/6-efa-loopback.sh rename to validation_and_observability/gpu-cluster-healthcheck/checks/6-efa-loopback.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/gpu-healthcheck.sh b/validation_and_observability/gpu-cluster-healthcheck/gpu-healthcheck.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/gpu-healthcheck.sh rename to validation_and_observability/gpu-cluster-healthcheck/gpu-healthcheck.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/instance-profiles.conf b/validation_and_observability/gpu-cluster-healthcheck/instance-profiles.conf similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/instance-profiles.conf rename to 
validation_and_observability/gpu-cluster-healthcheck/instance-profiles.conf diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/Dockerfile b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/Dockerfile similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/Dockerfile rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/Dockerfile diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/README.md b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/README.md similarity index 99% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/README.md rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/README.md index 43619b68b..69cb1cf53 100644 --- a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/README.md +++ b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/README.md @@ -52,7 +52,7 @@ The Kubernetes deployment consists of three components: ### 1. Build the Container Image ```bash -cd 4.validation_and_observability/2.gpu-cluster-healthcheck +cd validation_and_observability/gpu-cluster-healthcheck # Build docker build -f kubernetes/Dockerfile -t gpu-healthcheck:latest . 
diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/agent.sh b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/agent.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/agent.sh rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/agent.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/determine-severity.py b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/determine-severity.py similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/determine-severity.py rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/determine-severity.py diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/00-namespace.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/00-namespace.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/00-namespace.yaml rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/00-namespace.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/01-configmap.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/01-configmap.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/01-configmap.yaml rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/01-configmap.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/02-rbac.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/02-rbac.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/02-rbac.yaml rename to 
validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/02-rbac.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/03-daemonset-agent.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/03-daemonset-agent.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/03-daemonset-agent.yaml rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/03-daemonset-agent.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/04-cronjob-sweeper.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/04-cronjob-sweeper.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/04-cronjob-sweeper.yaml rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/04-cronjob-sweeper.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/05-job-quarantine.yaml b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/05-job-quarantine.yaml similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/manifests/05-job-quarantine.yaml rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/manifests/05-job-quarantine.yaml diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/sweeper.sh b/validation_and_observability/gpu-cluster-healthcheck/kubernetes/sweeper.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/kubernetes/sweeper.sh rename to validation_and_observability/gpu-cluster-healthcheck/kubernetes/sweeper.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/lib/aggregate-results.py 
b/validation_and_observability/gpu-cluster-healthcheck/lib/aggregate-results.py similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/lib/aggregate-results.py rename to validation_and_observability/gpu-cluster-healthcheck/lib/aggregate-results.py diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/lib/common.sh b/validation_and_observability/gpu-cluster-healthcheck/lib/common.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/lib/common.sh rename to validation_and_observability/gpu-cluster-healthcheck/lib/common.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/lib/parse-dcgm-results.py b/validation_and_observability/gpu-cluster-healthcheck/lib/parse-dcgm-results.py similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/lib/parse-dcgm-results.py rename to validation_and_observability/gpu-cluster-healthcheck/lib/parse-dcgm-results.py diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/README.md b/validation_and_observability/gpu-cluster-healthcheck/slurm/README.md similarity index 97% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/README.md rename to validation_and_observability/gpu-cluster-healthcheck/slurm/README.md index c28abffe7..15eaff0de 100644 --- a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/README.md +++ b/validation_and_observability/gpu-cluster-healthcheck/slurm/README.md @@ -28,7 +28,7 @@ Slurm-native integration for the GPU cluster health check suite, including prolo Add to `slurm.conf`: ```conf -Prolog=/path/to/2.gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh +Prolog=/path/to/gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh PrologTimeout=900 # 15 minutes ``` diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/examples/cron-rolling-sweep.sh 
b/validation_and_observability/gpu-cluster-healthcheck/slurm/examples/cron-rolling-sweep.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/examples/cron-rolling-sweep.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/examples/cron-rolling-sweep.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/examples/slurm-epilog-example.sh b/validation_and_observability/gpu-cluster-healthcheck/slurm/examples/slurm-epilog-example.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/examples/slurm-epilog-example.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/examples/slurm-epilog-example.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh b/validation_and_observability/gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/prolog-gpu-healthcheck.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-intensive.sh b/validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-intensive.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-intensive.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-intensive.sh diff --git a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-lightweight.sh b/validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-lightweight.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-lightweight.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-lightweight.sh diff --git 
a/4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-quarantine-workflow.sh b/validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-quarantine-workflow.sh similarity index 100% rename from 4.validation_and_observability/2.gpu-cluster-healthcheck/slurm/sbatch-quarantine-workflow.sh rename to validation_and_observability/gpu-cluster-healthcheck/slurm/sbatch-quarantine-workflow.sh diff --git a/4.validation_and_observability/5.nsight/2.generate_recipes.sh b/validation_and_observability/nsight/2.generate_recipes.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/2.generate_recipes.sh rename to validation_and_observability/nsight/2.generate_recipes.sh diff --git a/4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa b/validation_and_observability/nsight/EKS/Dockerfile.llama2-efa similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/Dockerfile.llama2-efa rename to validation_and_observability/nsight/EKS/Dockerfile.llama2-efa diff --git a/4.validation_and_observability/5.nsight/EKS/custom_values.yaml b/validation_and_observability/nsight/EKS/custom_values.yaml similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/custom_values.yaml rename to validation_and_observability/nsight/EKS/custom_values.yaml diff --git a/4.validation_and_observability/5.nsight/EKS/fsdp.yaml b/validation_and_observability/nsight/EKS/fsdp.yaml similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/fsdp.yaml rename to validation_and_observability/nsight/EKS/fsdp.yaml diff --git a/4.validation_and_observability/5.nsight/EKS/fsdp_eks_report_screenshot.png b/validation_and_observability/nsight/EKS/fsdp_eks_report_screenshot.png similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/fsdp_eks_report_screenshot.png rename to validation_and_observability/nsight/EKS/fsdp_eks_report_screenshot.png diff --git 
a/4.validation_and_observability/5.nsight/EKS/install-injector b/validation_and_observability/nsight/EKS/install-injector similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/install-injector rename to validation_and_observability/nsight/EKS/install-injector diff --git a/4.validation_and_observability/5.nsight/EKS/label-namespace b/validation_and_observability/nsight/EKS/label-namespace similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/label-namespace rename to validation_and_observability/nsight/EKS/label-namespace diff --git a/4.validation_and_observability/5.nsight/EKS/llama3_2_1b-fsdp-nsight.yaml b/validation_and_observability/nsight/EKS/llama3_2_1b-fsdp-nsight.yaml similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/llama3_2_1b-fsdp-nsight.yaml rename to validation_and_observability/nsight/EKS/llama3_2_1b-fsdp-nsight.yaml diff --git a/4.validation_and_observability/5.nsight/EKS/move_report b/validation_and_observability/nsight/EKS/move_report similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/move_report rename to validation_and_observability/nsight/EKS/move_report diff --git a/4.validation_and_observability/5.nsight/EKS/nsys-profile.sh b/validation_and_observability/nsight/EKS/nsys-profile.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/nsys-profile.sh rename to validation_and_observability/nsight/EKS/nsys-profile.sh diff --git a/4.validation_and_observability/5.nsight/EKS/nsys_analyze.py b/validation_and_observability/nsight/EKS/nsys_analyze.py similarity index 100% rename from 4.validation_and_observability/5.nsight/EKS/nsys_analyze.py rename to validation_and_observability/nsight/EKS/nsys_analyze.py diff --git a/4.validation_and_observability/5.nsight/EKS/uniinstall-injector b/validation_and_observability/nsight/EKS/uniinstall-injector similarity index 100% rename from 
4.validation_and_observability/5.nsight/EKS/uniinstall-injector rename to validation_and_observability/nsight/EKS/uniinstall-injector diff --git a/4.validation_and_observability/5.nsight/README.md b/validation_and_observability/nsight/README.md similarity index 97% rename from 4.validation_and_observability/5.nsight/README.md rename to validation_and_observability/nsight/README.md index 876fab801..6e8e5b4f0 100644 --- a/4.validation_and_observability/5.nsight/README.md +++ b/validation_and_observability/nsight/README.md @@ -4,9 +4,9 @@ We will show how to profile and analyze: -1. [NCCL Tests](https://github.com/awslabs/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests/slurm) -2. [Distributed training run with NeMo](https://github.com/awslabs/awsome-distributed-training/tree/main/3.test_cases/2.nemo-launcher) -3. [Distributed training run with FSDP](https://github.com/awslabs/awsome-distributed-training/tree/main/3.test_cases/10.FSDP) +1. [NCCL Tests](https://github.com/awslabs/awsome-distributed-ai/tree/main/micro-benchmarks/nccl-tests/slurm) +2. [Distributed training run with NeMo](https://github.com/awslabs/awsome-distributed-ai/tree/main/examples/2.nemo-launcher) +3. [Distributed training run with FSDP](https://github.com/awslabs/awsome-distributed-ai/tree/main/examples/10.FSDP) 4. Setup Nsight on an EKS cluster # 0. Prerequisities @@ -144,7 +144,7 @@ if batch_idx == args.nsys_end_step and global_rank == 0: # 4. Profiling NCCL tests -In this section we will show how to generate Nsight reports for NCCL tests. Follow the instructions [here](https://github.com/awslabs/awsome-distributed-training/tree/main/4.validation_and_observability/0.nccl-tests) to setup NCCL tests and generate the Enroot image `nccl.sqsh`. The `0.nsight_nccl.sbatch` script shows an example on how to profile the NCCL run with Nsight and collect EFA metrics. 
Key differences between `0.nsight_nccl.sbatch` and [this](https://github.com/awslabs/awsome-distributed-training/blob/main/4.validation_and_observability/0.nccl-tests/1.nccl-tests.sbatch) are: +In this section we will show how to generate Nsight reports for NCCL tests. Follow the instructions [here](https://github.com/awslabs/awsome-distributed-ai/tree/main/validation_and_observability/0.nccl-tests) to setup NCCL tests and generate the Enroot image `nccl.sqsh`. The `0.nsight_nccl.sbatch` script shows an example on how to profile the NCCL run with Nsight and collect EFA metrics. Key differences between `0.nsight_nccl.sbatch` and [this](https://github.com/awslabs/awsome-distributed-ai/blob/main/validation_and_observability/0.nccl-tests/1.nccl-tests.sbatch) are: 1. `/fsx` needs to be mounted to the container as this is where our Nsight binaries are located. 2. The `0.nsight_nccl.sbatch` script references the executable `nsys-slurm-exec` which is given below and should exist in `/fsx` diff --git a/4.validation_and_observability/5.nsight/fsdp-llama2/1.distributed-training.sbatch b/validation_and_observability/nsight/fsdp-llama2/1.distributed-training.sbatch similarity index 100% rename from 4.validation_and_observability/5.nsight/fsdp-llama2/1.distributed-training.sbatch rename to validation_and_observability/nsight/fsdp-llama2/1.distributed-training.sbatch diff --git a/4.validation_and_observability/5.nsight/fsdp-llama2/fsdp_rep_screenshot.png b/validation_and_observability/nsight/fsdp-llama2/fsdp_rep_screenshot.png similarity index 100% rename from 4.validation_and_observability/5.nsight/fsdp-llama2/fsdp_rep_screenshot.png rename to validation_and_observability/nsight/fsdp-llama2/fsdp_rep_screenshot.png diff --git a/4.validation_and_observability/5.nsight/fsdp-llama2/nsys-slurm-exec b/validation_and_observability/nsight/fsdp-llama2/nsys-slurm-exec similarity index 100% rename from 4.validation_and_observability/5.nsight/fsdp-llama2/nsys-slurm-exec rename to 
validation_and_observability/nsight/fsdp-llama2/nsys-slurm-exec diff --git a/4.validation_and_observability/5.nsight/fsdp-llama2/train.py b/validation_and_observability/nsight/fsdp-llama2/train.py similarity index 100% rename from 4.validation_and_observability/5.nsight/fsdp-llama2/train.py rename to validation_and_observability/nsight/fsdp-llama2/train.py diff --git a/4.validation_and_observability/5.nsight/nccl/0.nsight_nccl.sbatch b/validation_and_observability/nsight/nccl/0.nsight_nccl.sbatch similarity index 100% rename from 4.validation_and_observability/5.nsight/nccl/0.nsight_nccl.sbatch rename to validation_and_observability/nsight/nccl/0.nsight_nccl.sbatch diff --git a/4.validation_and_observability/5.nsight/nccl/NCCL_Scatter_Perf.png b/validation_and_observability/nsight/nccl/NCCL_Scatter_Perf.png similarity index 100% rename from 4.validation_and_observability/5.nsight/nccl/NCCL_Scatter_Perf.png rename to validation_and_observability/nsight/nccl/NCCL_Scatter_Perf.png diff --git a/4.validation_and_observability/5.nsight/nccl/all_reduce_csv_screenshot.png b/validation_and_observability/nsight/nccl/all_reduce_csv_screenshot.png similarity index 100% rename from 4.validation_and_observability/5.nsight/nccl/all_reduce_csv_screenshot.png rename to validation_and_observability/nsight/nccl/all_reduce_csv_screenshot.png diff --git a/4.validation_and_observability/5.nsight/nccl/all_reduce_sum.png b/validation_and_observability/nsight/nccl/all_reduce_sum.png similarity index 100% rename from 4.validation_and_observability/5.nsight/nccl/all_reduce_sum.png rename to validation_and_observability/nsight/nccl/all_reduce_sum.png diff --git a/4.validation_and_observability/5.nsight/nccl/plot_nccl.py b/validation_and_observability/nsight/nccl/plot_nccl.py similarity index 100% rename from 4.validation_and_observability/5.nsight/nccl/plot_nccl.py rename to validation_and_observability/nsight/nccl/plot_nccl.py diff --git 
a/4.validation_and_observability/5.nsight/nemotron/1.nemotron.sbatch b/validation_and_observability/nsight/nemotron/1.nemotron.sbatch similarity index 100% rename from 4.validation_and_observability/5.nsight/nemotron/1.nemotron.sbatch rename to validation_and_observability/nsight/nemotron/1.nemotron.sbatch diff --git a/4.validation_and_observability/5.nsight/nemotron/nemo.Dockerfile b/validation_and_observability/nsight/nemotron/nemo.Dockerfile similarity index 100% rename from 4.validation_and_observability/5.nsight/nemotron/nemo.Dockerfile rename to validation_and_observability/nsight/nemotron/nemo.Dockerfile diff --git a/4.validation_and_observability/5.nsight/nemotron/nemotron-15B-P5-report.png b/validation_and_observability/nsight/nemotron/nemotron-15B-P5-report.png similarity index 100% rename from 4.validation_and_observability/5.nsight/nemotron/nemotron-15B-P5-report.png rename to validation_and_observability/nsight/nemotron/nemotron-15B-P5-report.png diff --git a/4.validation_and_observability/5.nsight/nemotron/nemotron-slurm-exec.sh b/validation_and_observability/nsight/nemotron/nemotron-slurm-exec.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/nemotron/nemotron-slurm-exec.sh rename to validation_and_observability/nsight/nemotron/nemotron-slurm-exec.sh diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/get_nccl_msg_size.py b/validation_and_observability/nsight/slurm-workshop-artifacts/get_nccl_msg_size.py similarity index 100% rename from 4.validation_and_observability/5.nsight/slurm-workshop-artifacts/get_nccl_msg_size.py rename to validation_and_observability/nsight/slurm-workshop-artifacts/get_nccl_msg_size.py diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/install_nsight.sh b/validation_and_observability/nsight/slurm-workshop-artifacts/install_nsight.sh similarity index 100% rename from 
4.validation_and_observability/5.nsight/slurm-workshop-artifacts/install_nsight.sh rename to validation_and_observability/nsight/slurm-workshop-artifacts/install_nsight.sh diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh b/validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh rename to validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec-delay-duration.sh diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh b/validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh rename to validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec-steps.sh diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh b/validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh similarity index 100% rename from 4.validation_and_observability/5.nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh rename to validation_and_observability/nsight/slurm-workshop-artifacts/nccl-slurm-exec.sh diff --git a/4.validation_and_observability/5.nsight/slurm-workshop-artifacts/requirements.txt b/validation_and_observability/nsight/slurm-workshop-artifacts/requirements.txt similarity index 100% rename from 4.validation_and_observability/5.nsight/slurm-workshop-artifacts/requirements.txt rename to validation_and_observability/nsight/slurm-workshop-artifacts/requirements.txt diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/README.md b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/README.md similarity 
index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/README.md rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/README.md diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/cluster-observability.yaml b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/cluster-observability.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/cluster-observability.yaml rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/cluster-observability.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/.gitignore diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/DCGM_exporter_dashboard.json b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/DCGM_exporter_dashboard.json similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/DCGM_exporter_dashboard.json rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/DCGM_exporter_dashboard.json diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/create_ml_dashboards.py b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/create_ml_dashboards.py similarity index 100% rename from 
4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/create_ml_dashboards.py rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/create_ml_dashboards.py diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/requirements.txt b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/requirements.txt similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/dashboards/requirements.txt rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/dashboards/requirements.txt diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/img/observability-dashboard.png b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/img/observability-dashboard.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/img/observability-dashboard.png rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/img/observability-dashboard.png diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/managed-cluster-observability-pc.yaml b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/managed-cluster-observability-pc.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/managed-cluster-observability-pc.yaml rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/managed-cluster-observability-pc.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/prometheus-agent-collector.yaml b/validation_and_observability/prometheus-grafana/1click-dashboards-deployment/prometheus-agent-collector.yaml 
similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment/prometheus-agent-collector.yaml rename to validation_and_observability/prometheus-grafana/1click-dashboards-deployment/prometheus-agent-collector.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/README-OS-grafana.md b/validation_and_observability/prometheus-grafana/README-OS-grafana.md similarity index 95% rename from 4.validation_and_observability/4.prometheus-grafana/README-OS-grafana.md rename to validation_and_observability/prometheus-grafana/README-OS-grafana.md index 4e807b4f6..371af6f5d 100644 --- a/4.validation_and_observability/4.prometheus-grafana/README-OS-grafana.md +++ b/validation_and_observability/prometheus-grafana/README-OS-grafana.md @@ -18,7 +18,7 @@ To get started, you will initiate the provisioning of an Amazon CloudFormation S ### Deploy the CloudFormation Stack -[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability-os-grafana.yaml&stackName=Cluster-Observability-OS-Grafana) +[
 1-Click Deploy 🚀 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/cluster-observability-os-grafana.yaml&stackName=Cluster-Observability-OS-Grafana) >[!IMPORTANT] > It is strongly recommended you deploy this stack into the same region and same account as your SageMaker HyperPod Cluster.This will ensure successful execution of the Lifecycle Scripts, specifically `install_prometheus.sh`, which relies on AWS CLI commands that assume same account and same region. @@ -125,7 +125,7 @@ Scheduling: CustomActions: OnNodeConfigured: Sequence: - - Script: https://raw.githubusercontent.com/awslabs/awsome-distributed-training/main/1.architectures/2.aws-parallelcluster/post-install-scripts/install-node-exporter.sh + - Script: https://raw.githubusercontent.com/awslabs/awsome-distributed-ai/main/architectures/aws-parallelcluster/post-install-scripts/install-node-exporter.sh Networking: AdditionalSecurityGroups: - # Retrieved from previous step diff --git a/4.validation_and_observability/4.prometheus-grafana/README-grafana-alerts.md b/validation_and_observability/prometheus-grafana/README-grafana-alerts.md similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/README-grafana-alerts.md rename to validation_and_observability/prometheus-grafana/README-grafana-alerts.md diff --git a/4.validation_and_observability/4.prometheus-grafana/README.md b/validation_and_observability/prometheus-grafana/README.md similarity index 87% rename from 4.validation_and_observability/4.prometheus-grafana/README.md rename to validation_and_observability/prometheus-grafana/README.md index 4396d4734..2a0d9eea3 100644 --- a/4.validation_and_observability/4.prometheus-grafana/README.md +++ b/validation_and_observability/prometheus-grafana/README.md @@ -32,16 +32,16 @@ If you are using an environment which does not allow to use IAM Identity Center 
![observability_architecture](./assets/observability_architecture.png) -The solution uses SageMaker HyperPod [Lifecycle Scripts](https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod#31-lifecycle-scripts), to bootstrap your cluster with the following open-source exporter services: +The solution uses SageMaker HyperPod [Lifecycle Scripts](https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/sagemaker-hyperpod-slurm#31-lifecycle-scripts) to bootstrap your cluster with the following open-source exporter services: | Name | Script Deployment Target | Metrics Description | | ------------------------------------------------------------------ | -------- | --------------------------------------------------- | | [`0.Prometheus Slurm Exporter`](https://github.com/SckyzO/slurm_exporter) | controller-node | SLURM Accounting metrics (sinfo, sacct) | -| [`1.EFA-Node-Exporter`](https://github.com/awslabs/awsome-distributed-training/tree/main/4.validation_and_observability/3.efa-node-exporter) | cluster-nodes | Fork of Node exporter to include metrics from emitted from EFA | +| [`1.EFA-Node-Exporter`](https://github.com/awslabs/awsome-distributed-ai/tree/main/validation_and_observability/efa-node-exporter) | cluster-nodes | Fork of Node exporter to include metrics emitted from EFA | | [`2.NVIDIA-DCGM-Exporter`](https://github.com/NVIDIA/dcgm-exporter) | cluster-nodes | Nvidia DCGM Metrics about Nvidia Enabled GPUs | ### Prerequisites -To enable these exporter services, modify the [config.py](https://github.com/awslabs/awsome-distributed-training/blob/main/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py) file to configure `enable_observability = True`. 
Save this file, and [upload it to the s3 bucket path](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/03-s3) referenced in your [`cluster-config.json`](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/04-create-cluster#create-cluster) file. By modifying `config.py` and uploading to S3, this will ensure that any new nodes added or replaced in the HyperPod cluster will also be created with the metric exporter scripts running +To enable these exporter services, modify the [config.py](https://github.com/awslabs/awsome-distributed-ai/blob/main/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config/config.py) file to configure `enable_observability = True`. Save this file, and [upload it to the s3 bucket path](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/03-s3) referenced in your [`cluster-config.json`](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/01-cluster/04-create-cluster#create-cluster) file. Modifying `config.py` and uploading it to S3 ensures that any new nodes added or replaced in the HyperPod cluster will also be created with the metric exporter scripts running. If you have already created your HyperPod cluster, you can follow [these instructions](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/06-observability/09-update) to update your existing HyperPod cluster with Observability. @@ -52,11 +52,11 @@ If you have already created your HyperPod cluster, you can follow [these instruc ### Deploy the CloudFormation Stack -[
1-Click Deploy
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability.yaml&stackName=Cluster-Observability) +[
1-Click Deploy
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/cluster-observability.yaml&stackName=Cluster-Observability) Alternatively, you can deploy OS Grafana stack. -[
1-Click Deploy
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability-os-grafana.yaml&stackName=Cluster-Observability-OS-Grafana) +[
1-Click Deploy
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-ai.s3.amazonaws.com/templates/cluster-observability-os-grafana.yaml&stackName=Cluster-Observability-OS-Grafana) >[!IMPORTANT] > It is strongly recommended you deploy this stack into the same region and same account as your SageMaker HyperPod Cluster.This will ensure successful execution of the Lifecycle Scripts, specifically `install_prometheus.sh`, which relies on AWS CLI commands that assume same account and same region. diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/Observability-Architecture.png b/validation_and_observability/prometheus-grafana/assets/Observability-Architecture.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/Observability-Architecture.png rename to validation_and_observability/prometheus-grafana/assets/Observability-Architecture.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/add-to-channel.png b/validation_and_observability/prometheus-grafana/assets/add-to-channel.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/add-to-channel.png rename to validation_and_observability/prometheus-grafana/assets/add-to-channel.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/alert-firing.png b/validation_and_observability/prometheus-grafana/assets/alert-firing.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/alert-firing.png rename to validation_and_observability/prometheus-grafana/assets/alert-firing.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/alert-test.png b/validation_and_observability/prometheus-grafana/assets/alert-test.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/alert-test.png rename to 
validation_and_observability/prometheus-grafana/assets/alert-test.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/alert-threshold.png b/validation_and_observability/prometheus-grafana/assets/alert-threshold.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/alert-threshold.png rename to validation_and_observability/prometheus-grafana/assets/alert-threshold.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/configure-contact-point.png b/validation_and_observability/prometheus-grafana/assets/configure-contact-point.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/configure-contact-point.png rename to validation_and_observability/prometheus-grafana/assets/configure-contact-point.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/configure_grafana_alerting.png b/validation_and_observability/prometheus-grafana/assets/configure_grafana_alerting.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/configure_grafana_alerting.png rename to validation_and_observability/prometheus-grafana/assets/configure_grafana_alerting.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/create-slack-app.png b/validation_and_observability/prometheus-grafana/assets/create-slack-app.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/create-slack-app.png rename to validation_and_observability/prometheus-grafana/assets/create-slack-app.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png b/validation_and_observability/prometheus-grafana/assets/dcgm-dashboard.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/dcgm-dashboard.png rename to validation_and_observability/prometheus-grafana/assets/dcgm-dashboard.png diff --git 
a/4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png b/validation_and_observability/prometheus-grafana/assets/efa-node-dashboard.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/efa-node-dashboard.png rename to validation_and_observability/prometheus-grafana/assets/efa-node-dashboard.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/enable_grafana_alerting.png b/validation_and_observability/prometheus-grafana/assets/enable_grafana_alerting.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/enable_grafana_alerting.png rename to validation_and_observability/prometheus-grafana/assets/enable_grafana_alerting.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/eval-threshold.png b/validation_and_observability/prometheus-grafana/assets/eval-threshold.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/eval-threshold.png rename to validation_and_observability/prometheus-grafana/assets/eval-threshold.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-alert.png b/validation_and_observability/prometheus-grafana/assets/gpu-health-alert.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-alert.png rename to validation_and_observability/prometheus-grafana/assets/gpu-health-alert.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-by-node.png b/validation_and_observability/prometheus-grafana/assets/gpu-health-by-node.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/gpu-health-by-node.png rename to validation_and_observability/prometheus-grafana/assets/gpu-health-by-node.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/gpu-health.png 
b/validation_and_observability/prometheus-grafana/assets/gpu-health.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/gpu-health.png rename to validation_and_observability/prometheus-grafana/assets/gpu-health.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png b/validation_and_observability/prometheus-grafana/assets/grafana-datasource-configure.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource-configure.png rename to validation_and_observability/prometheus-grafana/assets/grafana-datasource-configure.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png b/validation_and_observability/prometheus-grafana/assets/grafana-datasource.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/grafana-datasource.png rename to validation_and_observability/prometheus-grafana/assets/grafana-datasource.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana-service-token-lambda-function.zip b/validation_and_observability/prometheus-grafana/assets/grafana-service-token-lambda-function.zip similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/grafana-service-token-lambda-function.zip rename to validation_and_observability/prometheus-grafana/assets/grafana-service-token-lambda-function.zip diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png b/validation_and_observability/prometheus-grafana/assets/grafana_users.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/grafana_users.png rename to validation_and_observability/prometheus-grafana/assets/grafana_users.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png 
b/validation_and_observability/prometheus-grafana/assets/grafana_users_admin.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/grafana_users_admin.png rename to validation_and_observability/prometheus-grafana/assets/grafana_users_admin.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/observability-slurm-custom-resource-function.zip b/validation_and_observability/prometheus-grafana/assets/observability-slurm-custom-resource-function.zip similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/observability-slurm-custom-resource-function.zip rename to validation_and_observability/prometheus-grafana/assets/observability-slurm-custom-resource-function.zip diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png b/validation_and_observability/prometheus-grafana/assets/observability_architecture.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/observability_architecture.png rename to validation_and_observability/prometheus-grafana/assets/observability_architecture.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource1.png b/validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource1.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource1.png rename to validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource1.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource2.png b/validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource2.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource2.png rename to 
validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource2.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource3.png b/validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource3.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/os-grafana-set-datasource3.png rename to validation_and_observability/prometheus-grafana/assets/os-grafana-set-datasource3.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png b/validation_and_observability/prometheus-grafana/assets/prometheus_running.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/prometheus_running.png rename to validation_and_observability/prometheus-grafana/assets/prometheus_running.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/retrieve-amp-endpoint.png b/validation_and_observability/prometheus-grafana/assets/retrieve-amp-endpoint.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/retrieve-amp-endpoint.png rename to validation_and_observability/prometheus-grafana/assets/retrieve-amp-endpoint.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/slack-app-workspace.png b/validation_and_observability/prometheus-grafana/assets/slack-app-workspace.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/slack-app-workspace.png rename to validation_and_observability/prometheus-grafana/assets/slack-app-workspace.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/slack-scopes.png b/validation_and_observability/prometheus-grafana/assets/slack-scopes.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/slack-scopes.png rename to 
validation_and_observability/prometheus-grafana/assets/slack-scopes.png diff --git a/4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png b/validation_and_observability/prometheus-grafana/assets/slurm-dashboard.png similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/assets/slurm-dashboard.png rename to validation_and_observability/prometheus-grafana/assets/slurm-dashboard.png diff --git a/4.validation_and_observability/4.prometheus-grafana/cluster-observability-os-grafana.yaml b/validation_and_observability/prometheus-grafana/cluster-observability-os-grafana.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/cluster-observability-os-grafana.yaml rename to validation_and_observability/prometheus-grafana/cluster-observability-os-grafana.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml b/validation_and_observability/prometheus-grafana/cluster-observability.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/cluster-observability.yaml rename to validation_and_observability/prometheus-grafana/cluster-observability.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/dcgm-metrics.csv b/validation_and_observability/prometheus-grafana/dcgm-metrics.csv similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/dcgm-metrics.csv rename to validation_and_observability/prometheus-grafana/dcgm-metrics.csv diff --git a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/README.md b/validation_and_observability/prometheus-grafana/eks-managed-observability/README.md similarity index 99% rename from 4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/README.md rename to validation_and_observability/prometheus-grafana/eks-managed-observability/README.md index 4bd52f1bf..fdbbe3520 100644 --- 
a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/README.md +++ b/validation_and_observability/prometheus-grafana/eks-managed-observability/README.md @@ -112,7 +112,7 @@ You should see GPU metrics like `DCGM_FI_DEV_GPU_UTIL`, `DCGM_FI_DEV_GPU_TEMP`, Use the provided script for automated setup: ```bash -cd awsome-distributed-training/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability +cd awsome-distributed-ai/validation_and_observability/prometheus-grafana/eks-managed-observability ./deploy-obs.sh ``` diff --git a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/adot-collector-prometheus.yaml b/validation_and_observability/prometheus-grafana/eks-managed-observability/adot-collector-prometheus.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/adot-collector-prometheus.yaml rename to validation_and_observability/prometheus-grafana/eks-managed-observability/adot-collector-prometheus.yaml diff --git a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/cleanup-obs.sh b/validation_and_observability/prometheus-grafana/eks-managed-observability/cleanup-obs.sh similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/cleanup-obs.sh rename to validation_and_observability/prometheus-grafana/eks-managed-observability/cleanup-obs.sh diff --git a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/cluster-observability.yaml b/validation_and_observability/prometheus-grafana/eks-managed-observability/cluster-observability.yaml similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/cluster-observability.yaml rename to validation_and_observability/prometheus-grafana/eks-managed-observability/cluster-observability.yaml diff --git 
a/4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/deploy-obs.sh b/validation_and_observability/prometheus-grafana/eks-managed-observability/deploy-obs.sh similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/eks-managed-observability/deploy-obs.sh rename to validation_and_observability/prometheus-grafana/eks-managed-observability/deploy-obs.sh diff --git a/4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh b/validation_and_observability/prometheus-grafana/update-prometheus.sh similarity index 100% rename from 4.validation_and_observability/4.prometheus-grafana/update-prometheus.sh rename to validation_and_observability/prometheus-grafana/update-prometheus.sh diff --git a/4.validation_and_observability/1.pytorch-env-validation/0.pytorch-screen.Dockerfile b/validation_and_observability/pytorch-env-validation/0.pytorch-screen.Dockerfile similarity index 100% rename from 4.validation_and_observability/1.pytorch-env-validation/0.pytorch-screen.Dockerfile rename to validation_and_observability/pytorch-env-validation/0.pytorch-screen.Dockerfile diff --git a/4.validation_and_observability/1.pytorch-env-validation/1.torch-screen.sbatch b/validation_and_observability/pytorch-env-validation/1.torch-screen.sbatch similarity index 100% rename from 4.validation_and_observability/1.pytorch-env-validation/1.torch-screen.sbatch rename to validation_and_observability/pytorch-env-validation/1.torch-screen.sbatch diff --git a/4.validation_and_observability/1.pytorch-env-validation/README.md b/validation_and_observability/pytorch-env-validation/README.md similarity index 98% rename from 4.validation_and_observability/1.pytorch-env-validation/README.md rename to validation_and_observability/pytorch-env-validation/README.md index 7dc207942..6222dbe57 100644 --- a/4.validation_and_observability/1.pytorch-env-validation/README.md +++ b/validation_and_observability/pytorch-env-validation/README.md @@ -16,7 
+16,7 @@ This guide assumes that you have the following: - Enroot requires libmd to compile and squashfs-tools to execute. - A shared directory mounted on `/apps` -It is recommended that you use the templates in the architectures [directory](../../1.architectures) to deploy Slurm (for example AWS ParallelCluster). +It is recommended that you use the templates in the architectures [directory](../../architectures) to deploy Slurm (for example AWS ParallelCluster). ## 1. Build the container and the squash file diff --git a/4.validation_and_observability/1.pytorch-env-validation/pytorch-screen.py b/validation_and_observability/pytorch-env-validation/pytorch-screen.py similarity index 100% rename from 4.validation_and_observability/1.pytorch-env-validation/pytorch-screen.py rename to validation_and_observability/pytorch-env-validation/pytorch-screen.py