awslabs · KeitaW · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -4,8 +4,8 @@
 # Enforces that a member of the @awslabs/sagemaker-hyperpod-dev team for HyperPod lifecycle scripts
 # They must approve any PRs that modify files under either base-config directory,
 # including all nested subdirectories and files.
-/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config     @awslabs/hyperpod-lcs-dev
-/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev
+/architectures/sagemaker-hyperpod-slurm/LifecycleScripts/base-config     @awslabs/hyperpod-lcs-dev
+/architectures/sagemaker-hyperpod-eks/LifecycleScripts/base-config @awslabs/hyperpod-lcs-dev
 
 # The blog PR gate runs with secrets via pull_request_target. Only blog
 # maintainers may modify it — this prevents a malicious edit from turning

diff --git a/.github/ISSUE_TEMPLATE/200-bug-report.yml b/.github/ISSUE_TEMPLATE/200-bug-report.yml
@@ -6,7 +6,7 @@ body:
   - type: markdown
     attributes:
       value: >
-        #### Before submitting a bug report, please make sure you have searched [existing issues](https://github.com/awslabs/awsome-distributed-training/issues).
+        #### Before submitting a bug report, please make sure you have searched [existing issues](https://github.com/awslabs/awsome-distributed-ai/issues).
 
 
         **IMPORTANT:** Please redact any access keys, secret keys, session tokens,

diff --git a/.github/ISSUE_TEMPLATE/300-ci-failure.yml b/.github/ISSUE_TEMPLATE/300-ci-failure.yml
@@ -24,7 +24,7 @@ body:
     attributes:
       label: GitHub Actions Run URL
       description: Link to the failing GitHub Actions run.
-      placeholder: https://github.com/awslabs/awsome-distributed-training/actions/runs/...
+      placeholder: https://github.com/awslabs/awsome-distributed-ai/actions/runs/...
     validations:
       required: false
   - type: dropdown

diff --git a/.github/ISSUE_TEMPLATE/400-feature-request.yml b/.github/ISSUE_TEMPLATE/400-feature-request.yml
@@ -8,7 +8,7 @@ body:
       value: >
         Thank you for suggesting a feature! For major changes that affect the project's
         architecture or direction, please consider using the
-        [RFC template](https://github.com/awslabs/awsome-distributed-training/issues/new?template=600-RFC.yml) instead.
+        [RFC template](https://github.com/awslabs/awsome-distributed-ai/issues/new?template=600-RFC.yml) instead.
   - type: textarea
     id: description
     attributes:

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -31,7 +31,7 @@
 <!-- If adding or updating a test case, ensure it follows the expected layout below. -->
 
 ```
-3.test_cases/
+examples/
 └── <framework>/                # e.g. pytorch, megatron, jax
     └── <library>/              # e.g. picotron, FSDP, megatron-lm
         └── <model>/            # e.g. SmolLM-1.7B (may be omitted for single-model cases)
@@ -48,7 +48,7 @@
 
 ## Checklist
 
-- [ ] I have read the [contributing guidelines](https://github.com/awslabs/awsome-distributed-training/blob/main/CONTRIBUTING.md).
+- [ ] I have read the [contributing guidelines](https://github.com/awslabs/awsome-distributed-ai/blob/main/CONTRIBUTING.md).
 - [ ] I am working against the latest `main` branch.
 - [ ] I have searched existing open and recently merged PRs to confirm this is not a duplicate.
 - [ ] The contribution is self-contained with documentation and scripts.

diff --git a/.github/workflows/fsdp-eks-regression.yml b/.github/workflows/fsdp-eks-regression.yml
@@ -3,7 +3,7 @@ name: FSDP Regression Test (EKS)
 on: 
   pull_request:
     paths:
-      - '3.test_cases/pytorch/FSDP/**'
+      - 'examples/training/fsdp/**'
 
   workflow_dispatch:
 
@@ -26,7 +26,7 @@ jobs:
       - name: Set env vars
         run: |
           BUILD_ID="${{ github.run_id }}"
-          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
+          FSDP_DIR="$(pwd)/${BUILD_ID}/examples/training/fsdp"
 
           # Set instance specific variables
           if [[ "${{ matrix.cluster }}" == "p5-eks" ]]; then

diff --git a/.github/workflows/fsdp-regression-test-container.yml b/.github/workflows/fsdp-regression-test-container.yml
@@ -6,10 +6,10 @@ on:
   push:
     branches: [ "main" ]
     paths: 
-      - '3.test_cases/pytorch/FSDP/**'
+      - 'examples/training/fsdp/**'
   pull_request:
     paths:
-      - '3.test_cases/pytorch/FSDP/**'
+      - 'examples/training/fsdp/**'
 
   workflow_dispatch:
 
@@ -95,7 +95,7 @@ jobs:
         id: build
         working-directory: source-code
         run: |
-          FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/pytorch/FSDP"
+          FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/examples/training/fsdp"
           ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh"
 
           echo "Building FSDP image on cluster..."
@@ -208,7 +208,7 @@ EOF
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP"
+          FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp"
           SBATCH_FILE="slurm/${{ matrix.model_config }}-training.sbatch"
           TMP_SBATCH="slurm/regression_test_${{ matrix.model_config }}_${{ matrix.cluster }}.sbatch"
 

diff --git a/.github/workflows/fsdp-regression-test-venv.yml b/.github/workflows/fsdp-regression-test-venv.yml
@@ -6,10 +6,10 @@ on:
   push:
     branches: [ "main" ]
     paths: 
-      - '3.test_cases/pytorch/FSDP/**'
+      - 'examples/training/fsdp/**'
   pull_request:
     paths:
-      - '3.test_cases/pytorch/FSDP/**'
+      - 'examples/training/fsdp/**'
 
   workflow_dispatch:
 
@@ -107,7 +107,7 @@ EOF
 
       - name: Create Virtual Environment on Cluster
         run: |
-          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
+          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp/slurm"
 
           echo "Creating virtual environment on cluster..."
           ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
@@ -123,7 +123,7 @@ EOF
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
+          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/examples/training/fsdp/slurm"
           SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
           TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
 

diff --git a/.github/workflows/megatron-ci-slurm.yaml b/.github/workflows/megatron-ci-slurm.yaml
@@ -6,10 +6,10 @@ on:
   push:
     branches: [ "main" ]
     paths: 
-      - '3.test_cases/megatron/megatron-lm/**'
+      - 'examples/training/megatron-lm/**'
   pull_request:
     paths:
-      - '3.test_cases/megatron/megatron-lm/**'
+      - 'examples/training/megatron-lm/**'
 
   workflow_dispatch:
 
@@ -95,7 +95,7 @@ jobs:
         id: build
         working-directory: source-code
         run: |
-          MEGATRON_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/megatron/megatron-lm"
+          MEGATRON_DIR="${{ env.REMOTE_BUILD_PATH }}/examples/training/megatron-lm"
           ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/megatron-${{ github.run_id }}-${{ matrix.cluster }}.sqsh"
 
           echo "Building Megatron-LM image on cluster..."

diff --git a/.github/workflows/pr-review-and-slurm-test.yml b/.github/workflows/pr-review-and-slurm-test.yml
@@ -10,7 +10,7 @@ env:
   AWS_REGION: us-east-1
   SLURM_HOST: p5en.smml.aiml.aws.dev
   SLURM_USER: ghactions
-  RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-training
+  RESULTS_PATH: /fsx/agents/pr-reviews/awsome-distributed-ai
   AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
 
 permissions:

diff --git a/.gitignore b/.gitignore
@@ -39,7 +39,7 @@ downloads/
 eggs/
 .eggs/
 lib/
-!4.validation_and_observability/2.gpu-cluster-healthcheck/lib/
+!validation_and_observability/gpu-cluster-healthcheck/lib/
 lib64/
 parts/
 sdist/

diff --git a/README.md b/README.md
@@ -5,14 +5,14 @@ This repository contains reference architectures and test cases for distributed
 The major components of this directory are:
 
 ```
-├── 1.architectures/               # CloudFormation templates for reference architectures
-├── 2.ami_and_containers/          # Scripts to create AMIs and container images
-├── 3.test_cases/                  # Reference test cases and/or benchmark scripts
-├── 4.validation_and_observability/# Tools to measure performance or troubleshoot
+├── architectures/                 # CloudFormation templates for reference architectures
+├── ami_and_containers/            # Scripts to create AMIs and container images
+├── examples/                      # Reference test cases and/or benchmark scripts
+├── validation_and_observability/  # Tools to measure performance or troubleshoot
 └── micro-benchmarks/              # Micro-benchmarks (NCCL, NCCOM, NVSHMEM, etc.)
 ```
 
-**NOTE**: The architectures are designed to work with the S3 bucket and VPC created using reference templates `1.architectures/0.common/` and `1.architectures/1.vpc_network/`. _You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._
+**NOTE**: The architectures are designed to work with the S3 bucket and VPC created using reference templates `architectures/common/` and `architectures/vpc_network/`. _You're strongly recommended to deploy these two templates **before** deploying any of the reference architectures._
 
 ## 0. Workshops
 
@@ -28,64 +28,69 @@ You can follow the workshops below to train models on AWS. Each contains example
 
 Posts about distributed ML training on AWS are published at <https://awslabs.github.io/awsome-distributed/>. The Hugo source lives on the [`content`](https://github.com/awslabs/awsome-distributed/tree/content) branch.
 
-Blog content is editorially curated by AWS authors. Code samples in this repo (`1.architectures/`, `3.test_cases/`, etc.) accept external contributions as usual — see [CONTRIBUTING.md](./CONTRIBUTING.md).
+Blog content is editorially curated by AWS authors. Code samples in this repo (`architectures/`, `examples/`, etc.) accept external contributions as usual — see [CONTRIBUTING.md](./CONTRIBUTING.md).
 
 ## 1. Architectures
 
-Architectures are located in `1.architectures` and consist of utilities and service-related architectures.
+Architectures are located in `architectures` and consist of utilities and service-related architectures.
 
 | Name                                                                           | Category | Usage                                                |
 | ------------------------------------------------------------------------------ | -------- | ---------------------------------------------------- |
-| [`0.common`](./1.architectures/0.common)                                       | Storage  | Common resources (S3 bucket, event notifications)    |
-| [`1.vpc_network`](./1.architectures/1.vpc_network)                             | Network  | Create a VPC with subnets and required resources     |
-| [`2.aws-parallelcluster`](./1.architectures/2.aws-parallelcluster)             | Compute  | Cluster templates for GPU & custom silicon training  |
-| [`3.aws-batch`](./1.architectures/3.aws-batch)                                 | Compute  | AWS Batch template for distributed training          |
-| [`4.amazon-eks`](./1.architectures/4.amazon-eks)                               | Compute  | Manifest files to train with Amazon EKS              |
-| [`5.sagemaker-hyperpod`](./1.architectures/5.sagemaker-hyperpod)               | Compute  | SageMaker HyperPod template for distributed training |
-| [`6.ldap_server`](./1.architectures/6.ldap_server)                             | Identity | LDAP server for multi-user cluster access            |
-| [`7.sagemaker-hyperpod-eks`](./1.architectures/7.sagemaker-hyperpod-eks)       | Compute  | SageMaker HyperPod with EKS orchestration            |
-| [`8.accounting-database`](./1.architectures/8.accounting-database)             | Tooling  | Accounting database for job tracking                 |
+| [`common`](./architectures/common)                                       | Storage  | Common resources (S3 bucket, event notifications)    |
+| [`vpc_network`](./architectures/vpc_network)                             | Network  | Create a VPC with subnets and required resources     |
+| [`aws-parallelcluster`](./architectures/aws-parallelcluster)             | Compute  | Cluster templates for GPU & custom silicon training  |
+| [`aws-batch`](./architectures/aws-batch)                                 | Compute  | AWS Batch template for distributed training          |
+| [`amazon-eks`](./architectures/amazon-eks)                               | Compute  | Manifest files to train with Amazon EKS              |
+| [`sagemaker-hyperpod-slurm`](./architectures/sagemaker-hyperpod-slurm)               | Compute  | SageMaker HyperPod template for distributed training |
+| [`ldap_server`](./architectures/ldap_server)                             | Identity | LDAP server for multi-user cluster access            |
+| [`sagemaker-hyperpod-eks`](./architectures/sagemaker-hyperpod-eks)       | Compute  | SageMaker HyperPod with EKS orchestration            |
+| [`accounting-database`](./architectures/accounting-database)             | Tooling  | Accounting database for job tracking                 |
 | [`aws-pcs`](./architectures/aws-pcs)                                           | Compute  | AWS Parallel Computing Service templates with Slurm scheduler |
 
-You will also find [documentation](./1.architectures/efa-cheatsheet.md) for EFA and the recommended environment variables.
+You will also find [documentation](./architectures/efa-cheatsheet.md) for EFA and the recommended environment variables.
 
 ## 2. Custom Amazon Machine Images
 
 Custom machine images can be built using [Packer](https://www.packer.io) for AWS ParallelCluster, Amazon EKS and plain EC2. These images are based on Ansible roles and playbooks.
 
-## 3. Test Cases
+## 3. Examples
 
-Test cases are organized under `3.test_cases/` by framework (e.g. `pytorch/`, `megatron/`, `jax/`). Within each framework, directories are named after the training library or method (e.g. `picotron/`, `FSDP/`, `megatron-lm/`).
+Examples live under `examples/` and are organized along two axes:
 
-Each test case follows this general structure:
+- **`examples/training/`** and **`examples/inference/`** — *framework-centric*. The training or inference engine is the subject, and model variants underneath illustrate it (e.g. `training/fsdp/`, `training/megatron-lm/`, `training/nemo/`). Swapping the model gives "the same example with a different model."
+- **`examples/use-cases/`** — *use-case-centric*. A specific model or task is the subject and the framework is incidental (e.g. `use-cases/detr-finetune/`, `use-cases/vjepa2/`). Swapping the framework would still leave a recognizable demo.
+
+Each example follows this general structure:
 
 ```
-3.test_cases/
-└── <framework>/                # e.g. pytorch, megatron, jax
-    └── <library>/              # e.g. picotron, FSDP, megatron-lm
-        └── <model>/            # e.g. SmolLM-1.7B (may be omitted for single-model cases)
-            ├── Dockerfile      # Container / environment setup
-            ├── README.md
-            ├── slurm/          # Slurm-specific launch scripts
-            ├── kubernetes/     # Kubernetes manifests
-            └── hyperpod-eks/   # HyperPod EKS instructions
+examples/
+├── training/                   # framework-centric training/fine-tuning engines
+│   └── <framework>/            # e.g. fsdp, deepspeed, megatron-lm, nemo, trl
+│       └── <model>/            # e.g. llama3 (may be omitted for single-model cases)
+│           ├── Dockerfile      # Container / environment setup
+│           ├── README.md
+│           ├── slurm/          # Slurm-specific launch scripts
+│           └── kubernetes/     # Kubernetes manifests
+├── inference/                  # framework-centric inference engines (vllm, …)
+└── use-cases/                  # use-case-centric end-to-end demos
+    └── <name>/                 # e.g. detr-finetune, esm2-hyperpod
 ```
 
-The top-level directory for each test case contains general introduction and environment setup (Dockerfiles, training scripts, configs), while subdirectories provide service-specific launch instructions.
+The top-level directory for each example contains general introduction and environment setup (Dockerfiles, training scripts, configs), while subdirectories provide service-specific launch instructions.
 
-Browse [`3.test_cases/`](./3.test_cases) to see the full list of available frameworks and test cases.
+Browse [`examples/`](./examples) to see the full list of frameworks, engines, and use cases.
 
 ## 4. Validation and Observability
 
-Utility scripts and tools for validating your environment and monitoring training jobs are under `4.validation_and_observability/`.
+Utility scripts and tools for validating your environment and monitoring training jobs are under `validation_and_observability/`.
 
 | Name                                                                                            | Comments                                                        |
 | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------- |
-| [`1.pytorch-env-validation`](./4.validation_and_observability/1.pytorch-env-validation)         | Validates your PyTorch environment                              |
-| [`2.gpu-cluster-healthcheck`](./4.validation_and_observability/2.gpu-cluster-healthcheck)       | GPU cluster health checks                                       |
-| [`3.efa-node-exporter`](./4.validation_and_observability/3.efa-node-exporter)                   | Node exporter with Amazon EFA monitoring modules                |
-| [`4.prometheus-grafana`](./4.validation_and_observability/4.prometheus-grafana)                  | Monitoring for SageMaker HyperPod and EKS GPU clusters          |
-| [`5.nsight`](./4.validation_and_observability/5.nsight)                                         | Shows how to run Nvidia Nsight Systems to profile your workload |
+| [`pytorch-env-validation`](./validation_and_observability/pytorch-env-validation)         | Validates your PyTorch environment                              |
+| [`gpu-cluster-healthcheck`](./validation_and_observability/gpu-cluster-healthcheck)       | GPU cluster health checks                                       |
+| [`efa-node-exporter`](./validation_and_observability/efa-node-exporter)                   | Node exporter with Amazon EFA monitoring modules                |
+| [`prometheus-grafana`](./validation_and_observability/prometheus-grafana)                  | Monitoring for SageMaker HyperPod and EKS GPU clusters          |
+| [`nsight`](./validation_and_observability/nsight)                                         | Shows how to run Nvidia Nsight Systems to profile your workload |
 
 ## 5. Micro-benchmarks
 

diff --git a/...ontainers/1.amazon_machine_image/Makefile → ..._containers/amazon_machine_image/Makefile b/...ontainers/1.amazon_machine_image/Makefile → ..._containers/amazon_machine_image/Makefile
diff --git a/...ntainers/1.amazon_machine_image/README.md → ...containers/amazon_machine_image/README.md b/...ntainers/1.amazon_machine_image/README.md → ...containers/amazon_machine_image/README.md
diff --git a/...achine_image/inventory/group_vars/all.yml → ...achine_image/inventory/group_vars/all.yml b/...achine_image/inventory/group_vars/all.yml → ...achine_image/inventory/group_vars/all.yml
diff --git a/...rs/1.amazon_machine_image/inventory/hosts → ...ners/amazon_machine_image/inventory/hosts b/...rs/1.amazon_machine_image/inventory/hosts → ...ners/amazon_machine_image/inventory/hosts
diff --git a/...1.amazon_machine_image/packer-ami.pkr.hcl → ...s/amazon_machine_image/packer-ami.pkr.hcl b/...1.amazon_machine_image/packer-ami.pkr.hcl → ...s/amazon_machine_image/packer-ami.pkr.hcl
diff --git a/...azon_machine_image/playbook-dlami-gpu.yml → ...azon_machine_image/playbook-dlami-gpu.yml b/...azon_machine_image/playbook-dlami-gpu.yml → ...azon_machine_image/playbook-dlami-gpu.yml
diff --git a/...n_machine_image/playbook-dlami-neuron.yml → ...n_machine_image/playbook-dlami-neuron.yml b/...n_machine_image/playbook-dlami-neuron.yml → ...n_machine_image/playbook-dlami-neuron.yml
diff --git a/...amazon_machine_image/playbook-eks-gpu.yml → ...amazon_machine_image/playbook-eks-gpu.yml b/...amazon_machine_image/playbook-eks-gpu.yml → ...amazon_machine_image/playbook-eks-gpu.yml
diff --git a/...n_machine_image/playbook-pcluster-cpu.yml → ...n_machine_image/playbook-pcluster-cpu.yml b/...n_machine_image/playbook-pcluster-cpu.yml → ...n_machine_image/playbook-pcluster-cpu.yml
diff --git a/...n_machine_image/playbook-pcluster-gpu.yml → ...n_machine_image/playbook-pcluster-gpu.yml b/...n_machine_image/playbook-pcluster-gpu.yml → ...n_machine_image/playbook-pcluster-gpu.yml
diff --git a/...achine_image/playbook-pcluster-neuron.yml → ...achine_image/playbook-pcluster-neuron.yml b/...achine_image/playbook-pcluster-neuron.yml → ...achine_image/playbook-pcluster-neuron.yml
diff --git a/...s/aws_cliv2/molecule/default/converge.yml → ...s/aws_cliv2/molecule/default/converge.yml b/...s/aws_cliv2/molecule/default/converge.yml → ...s/aws_cliv2/molecule/default/converge.yml
diff --git a/...s/aws_cliv2/molecule/default/molecule.yml → ...s/aws_cliv2/molecule/default/molecule.yml b/...s/aws_cliv2/molecule/default/molecule.yml → ...s/aws_cliv2/molecule/default/molecule.yml
diff --git a/...es/aws_cliv2/molecule/default/prepare.yml → ...es/aws_cliv2/molecule/default/prepare.yml b/...es/aws_cliv2/molecule/default/prepare.yml → ...es/aws_cliv2/molecule/default/prepare.yml
diff --git a/...les/aws_cliv2/molecule/default/verify.yml → ...les/aws_cliv2/molecule/default/verify.yml b/...les/aws_cliv2/molecule/default/verify.yml → ...les/aws_cliv2/molecule/default/verify.yml
diff --git a/...hine_image/roles/aws_cliv2/tasks/main.yml → ...hine_image/roles/aws_cliv2/tasks/main.yml b/...hine_image/roles/aws_cliv2/tasks/main.yml → ...hine_image/roles/aws_cliv2/tasks/main.yml
diff --git a/...ine_image/roles/aws_efa/defaults/main.yml → ...ine_image/roles/aws_efa/defaults/main.yml b/...ine_image/roles/aws_efa/defaults/main.yml → ...ine_image/roles/aws_efa/defaults/main.yml
diff --git a/...achine_image/roles/aws_efa/tasks/main.yml → ...achine_image/roles/aws_efa/tasks/main.yml b/...achine_image/roles/aws_efa/tasks/main.yml → ...achine_image/roles/aws_efa/tasks/main.yml
diff --git a/...ne_image/roles/aws_efa_ofi/tasks/main.yml → ...ne_image/roles/aws_efa_ofi/tasks/main.yml b/...ne_image/roles/aws_efa_ofi/tasks/main.yml → ...ne_image/roles/aws_efa_ofi/tasks/main.yml
diff --git a/..._image/roles/aws_lustre/defaults/main.yml → ..._image/roles/aws_lustre/defaults/main.yml b/..._image/roles/aws_lustre/defaults/main.yml → ..._image/roles/aws_lustre/defaults/main.yml
diff --git a/...ine_image/roles/aws_lustre/tasks/main.yml → ...ine_image/roles/aws_lustre/tasks/main.yml b/...ine_image/roles/aws_lustre/tasks/main.yml → ...ine_image/roles/aws_lustre/tasks/main.yml
diff --git a/...achine_image/roles/base/defaults/main.yml → ...achine_image/roles/base/defaults/main.yml b/...achine_image/roles/base/defaults/main.yml → ...achine_image/roles/base/defaults/main.yml
diff --git a/...n_machine_image/roles/base/tasks/main.yml → ...n_machine_image/roles/base/tasks/main.yml b/...n_machine_image/roles/base/tasks/main.yml → ...n_machine_image/roles/base/tasks/main.yml
diff --git a/...hine_image/roles/docker/defaults/main.yml → ...hine_image/roles/docker/defaults/main.yml b/...hine_image/roles/docker/defaults/main.yml → ...hine_image/roles/docker/defaults/main.yml
diff --git a/...machine_image/roles/docker/tasks/main.yml → ...machine_image/roles/docker/tasks/main.yml b/...machine_image/roles/docker/tasks/main.yml → ...machine_image/roles/docker/tasks/main.yml
diff --git a/...age/roles/neuron_driver/defaults/main.yml → ...age/roles/neuron_driver/defaults/main.yml b/...age/roles/neuron_driver/defaults/main.yml → ...age/roles/neuron_driver/defaults/main.yml
diff --git a/..._image/roles/neuron_driver/tasks/main.yml → ..._image/roles/neuron_driver/tasks/main.yml b/..._image/roles/neuron_driver/tasks/main.yml → ..._image/roles/neuron_driver/tasks/main.yml
diff --git a/...image/roles/nvidia_cuda/defaults/main.yml → ...image/roles/nvidia_cuda/defaults/main.yml b/...image/roles/nvidia_cuda/defaults/main.yml → ...image/roles/nvidia_cuda/defaults/main.yml
diff --git a/...ne_image/roles/nvidia_cuda/tasks/main.yml → ...ne_image/roles/nvidia_cuda/tasks/main.yml b/...ne_image/roles/nvidia_cuda/tasks/main.yml → ...ne_image/roles/nvidia_cuda/tasks/main.yml
diff --git a/...age/roles/nvidia_docker/defaults/main.yml → ...age/roles/nvidia_docker/defaults/main.yml b/...age/roles/nvidia_docker/defaults/main.yml → ...age/roles/nvidia_docker/defaults/main.yml
diff --git a/..._image/roles/nvidia_docker/tasks/main.yml → ..._image/roles/nvidia_docker/tasks/main.yml b/..._image/roles/nvidia_docker/tasks/main.yml → ..._image/roles/nvidia_docker/tasks/main.yml
diff --git a/...age/roles/nvidia_driver/defaults/main.yml → ...age/roles/nvidia_driver/defaults/main.yml b/...age/roles/nvidia_driver/defaults/main.yml → ...age/roles/nvidia_driver/defaults/main.yml
diff --git a/...iles/nvidia-persistenced-override.service → ...iles/nvidia-persistenced-override.service b/...iles/nvidia-persistenced-override.service → ...iles/nvidia-persistenced-override.service
diff --git a/..._image/roles/nvidia_driver/tasks/main.yml → ..._image/roles/nvidia_driver/tasks/main.yml b/..._image/roles/nvidia_driver/tasks/main.yml → ..._image/roles/nvidia_driver/tasks/main.yml
diff --git a/...les/nvidia_enroot_pyxis/defaults/main.yml → ...les/nvidia_enroot_pyxis/defaults/main.yml b/...les/nvidia_enroot_pyxis/defaults/main.yml → ...les/nvidia_enroot_pyxis/defaults/main.yml
diff --git a/.../roles/nvidia_enroot_pyxis/tasks/main.yml → .../roles/nvidia_enroot_pyxis/tasks/main.yml b/.../roles/nvidia_enroot_pyxis/tasks/main.yml → .../roles/nvidia_enroot_pyxis/tasks/main.yml
diff --git a/...nvidia_enroot_pyxis/templates/enroot.conf → ...nvidia_enroot_pyxis/templates/enroot.conf b/...nvidia_enroot_pyxis/templates/enroot.conf → ...nvidia_enroot_pyxis/templates/enroot.conf
diff --git a/...ge/roles/nvidia_gdrcopy/defaults/main.yml → ...ge/roles/nvidia_gdrcopy/defaults/main.yml b/...ge/roles/nvidia_gdrcopy/defaults/main.yml → ...ge/roles/nvidia_gdrcopy/defaults/main.yml
diff --git a/...image/roles/nvidia_gdrcopy/tasks/main.yml → ...image/roles/nvidia_gdrcopy/tasks/main.yml b/...image/roles/nvidia_gdrcopy/tasks/main.yml → ...image/roles/nvidia_gdrcopy/tasks/main.yml
diff --git a/...image/roles/nvidia_nccl/defaults/main.yml → ...image/roles/nvidia_nccl/defaults/main.yml b/...image/roles/nvidia_nccl/defaults/main.yml → ...image/roles/nvidia_nccl/defaults/main.yml
diff --git a/...ne_image/roles/nvidia_nccl/tasks/main.yml → ...ne_image/roles/nvidia_nccl/tasks/main.yml b/...ne_image/roles/nvidia_nccl/tasks/main.yml → ...ne_image/roles/nvidia_nccl/tasks/main.yml
diff --git a/...age/roles/observability/defaults/main.yml → ...age/roles/observability/defaults/main.yml b/...age/roles/observability/defaults/main.yml → ...age/roles/observability/defaults/main.yml
diff --git a/..._image/roles/observability/tasks/main.yml → ..._image/roles/observability/tasks/main.yml b/..._image/roles/observability/tasks/main.yml → ..._image/roles/observability/tasks/main.yml
diff --git a/...ne_image/roles/packages/defaults/main.yml → ...ne_image/roles/packages/defaults/main.yml b/...ne_image/roles/packages/defaults/main.yml → ...ne_image/roles/packages/defaults/main.yml
diff --git a/...chine_image/roles/packages/tasks/main.yml → ...chine_image/roles/packages/tasks/main.yml b/...chine_image/roles/packages/tasks/main.yml → ...chine_image/roles/packages/tasks/main.yml
diff --git a/...ge/roles/pytorch_neuron/defaults/main.yml → ...ge/roles/pytorch_neuron/defaults/main.yml b/...ge/roles/pytorch_neuron/defaults/main.yml → ...ge/roles/pytorch_neuron/defaults/main.yml
diff --git a/...image/roles/pytorch_neuron/tasks/main.yml → ...image/roles/pytorch_neuron/tasks/main.yml b/...image/roles/pytorch_neuron/tasks/main.yml → ...image/roles/pytorch_neuron/tasks/main.yml
diff --git a/...ers/pytorch/0.nvcr-pytorch-aws.dockerfile → ...ers/pytorch/0.nvcr-pytorch-aws.dockerfile b/...ers/pytorch/0.nvcr-pytorch-aws.dockerfile → ...ers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -29,7 +29,7 @@ ENV AWS_OFI_NCCL_VERSION=1.12.1-aws
 ENV NCCL_TESTS_VERSION=master
 
 ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
-# nccl>=2.19.0. See https://github.com/awslabs/awsome-distributed-training/tree/main/1.architectures/efa-cheatsheet.md
+# nccl>=2.19.0. See https://github.com/awslabs/awsome-distributed-ai/tree/main/architectures/efa-cheatsheet.md
 #ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 
 RUN apt-get update -y

diff --git a/...d_containers/containers/pytorch/README.md → ...d_containers/containers/pytorch/README.md b/...d_containers/containers/pytorch/README.md → ...d_containers/containers/pytorch/README.md
diff --git a/...reate_dlami/01.dlami-ub2004-base-gpu.yaml → ...reate_dlami/01.dlami-ub2004-base-gpu.yaml b/...reate_dlami/01.dlami-ub2004-base-gpu.yaml → ...reate_dlami/01.dlami-ub2004-base-gpu.yaml
diff --git a/...te_dlami/02.dlami-ub2004-pytorch-gpu.yaml → ...te_dlami/02.dlami-ub2004-pytorch-gpu.yaml b/...te_dlami/02.dlami-ub2004-pytorch-gpu.yaml → ...te_dlami/02.dlami-ub2004-pytorch-gpu.yaml
diff --git a/...tainers/3.pcluster_create_dlami/README.md → ...ontainers/pcluster_create_dlami/README.md b/...tainers/3.pcluster_create_dlami/README.md → ...ontainers/pcluster_create_dlami/README.md
diff --git a/2.ami_and_containers/tools/ec2md/README.md → ami_and_containers/tools/ec2md/README.md b/2.ami_and_containers/tools/ec2md/README.md → ami_and_containers/tools/ec2md/README.md
diff --git a/2.ami_and_containers/tools/ec2md/ec2md.sh → ami_and_containers/tools/ec2md/ec2md.sh b/2.ami_and_containers/tools/ec2md/ec2md.sh → ami_and_containers/tools/ec2md/ec2md.sh
diff --git a/...itectures/8.accounting-database/README.md → architectures/accounting-database/README.md b/...itectures/8.accounting-database/README.md → architectures/accounting-database/README.md
diff --git a/...ting-database/cf_database-accounting.yaml → ...ting-database/cf_database-accounting.yaml b/...ting-database/cf_database-accounting.yaml → ...ting-database/cf_database-accounting.yaml
diff --git a/1.architectures/4.amazon-eks/README.md → architectures/amazon-eks/README.md b/1.architectures/4.amazon-eks/README.md → architectures/amazon-eks/README.md
@@ -1,7 +1,7 @@
 
 # Amazon EKS distributed training architecture
 
-This project provides several reference architectures to run distributed training on Amazon EKS for different use cases using `p4d.24xlarge` instances (you can replace them by `p5` or `trn1`. These examples use [eksctl](eksctl.io) and a cluster manifest to create your specified Amazon EKS cluster.
+This project provides several reference architectures to run distributed training on Amazon EKS for different use cases using `p4d.24xlarge` instances (you can replace them with `p5` or `trn1`). These examples use [eksctl](https://eksctl.io) and a cluster manifest to create your specified Amazon EKS cluster.
 
 ## 0. Prerequisites
 
@@ -15,7 +15,7 @@ To deploy the architectures you must install the dependencies below. You are adv
 
 The following diagram shows a common architecture that can be used for distributed model training on EKS.
 
-<img align="center" src="../../0.docs/eks-model-training-single-az.png" width="60%" />
+<img align="center" src="../../assets/eks-model-training-single-az.png" width="60%" />
 
 The EKS cluster has two nodegroups. A `system` nodegroup is used to run pods like kube-dns, kubeflow training operator, etc. which provide internal cluster-scope services and can run on CPU. A worker nodegroup built with an accelerated instance type is used to run the distributed training workload. 
 

diff --git a/...hitectures/4.amazon-eks/eks-g4dn-vpc.yaml → architectures/amazon-eks/eks-g4dn-vpc.yaml b/...hitectures/4.amazon-eks/eks-g4dn-vpc.yaml → architectures/amazon-eks/eks-g4dn-vpc.yaml
diff --git a/1.architectures/4.amazon-eks/eks-g4dn.yaml → architectures/amazon-eks/eks-g4dn.yaml b/1.architectures/4.amazon-eks/eks-g4dn.yaml → architectures/amazon-eks/eks-g4dn.yaml
diff --git a/.../4.amazon-eks/eks-g5-node-autorepair.yaml → ...es/amazon-eks/eks-g5-node-autorepair.yaml b/.../4.amazon-eks/eks-g5-node-autorepair.yaml → ...es/amazon-eks/eks-g5-node-autorepair.yaml
diff --git a/...tures/4.amazon-eks/eks-p4de-odcr-vpc.yaml → ...ectures/amazon-eks/eks-p4de-odcr-vpc.yaml b/...tures/4.amazon-eks/eks-p4de-odcr-vpc.yaml → ...ectures/amazon-eks/eks-p4de-odcr-vpc.yaml
diff --git a/...itectures/4.amazon-eks/eks-p4de-odcr.yaml → architectures/amazon-eks/eks-p4de-odcr.yaml b/...itectures/4.amazon-eks/eks-p4de-odcr.yaml → architectures/amazon-eks/eks-p4de-odcr.yaml
diff --git a/...s/4.amazon-eks/eks-p5-capacity-block.yaml → ...res/amazon-eks/eks-p5-capacity-block.yaml b/...s/4.amazon-eks/eks-p5-capacity-block.yaml → ...res/amazon-eks/eks-p5-capacity-block.yaml
diff --git a/...ectures/4.amazon-eks/eks-p5-odcr-vpc.yaml → ...itectures/amazon-eks/eks-p5-odcr-vpc.yaml b/...ectures/4.amazon-eks/eks-p5-odcr-vpc.yaml → ...itectures/amazon-eks/eks-p5-odcr-vpc.yaml
diff --git a/.../0.aws-batch-distributed-training-p5.yaml → .../0.aws-batch-distributed-training-p5.yaml b/.../0.aws-batch-distributed-training-p5.yaml → .../0.aws-batch-distributed-training-p5.yaml
diff --git a/...tch/0.aws-batch-distributed-training.yaml → ...tch/0.aws-batch-distributed-training.yaml b/...tch/0.aws-batch-distributed-training.yaml → ...tch/0.aws-batch-distributed-training.yaml
-Original file line number
+Diff line change
@@ Expand Up / @@ -39,7 +39,7 @@ downloads/ @@
     eggs/
     .eggs/
     lib/
-    !4.validation_and_observability/2.gpu-cluster-healthcheck/lib/
+    !validation_and_observability/gpu-cluster-healthcheck/lib/
     lib64/
     parts/
     sdist/
@@ Expand Down @@